From 3f4f8292ca5cd84ffffdcc8cae2a01975e512c77 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Gon=C3=A7alo=20Silva?= <goncalossilva@gmail.com>
Date: Sat, 8 Jun 2019 03:02:47 +0100
Subject: [PATCH] Use markup regexp for character count

Deferring to Pandoc is not without its faults. Its output still
requires post-processing (e.g. horizontal rules turn into 72 dashes),
and it is significantly slower and more resource-hungry.

Conversely, the markup regexps have improved over time and are now able
to handle the task.
---
 uberwriter/markup_regex.py  |  2 +-
 uberwriter/stats_counter.py | 28 ++++++++++++++++++++--------
 2 files changed, 21 insertions(+), 9 deletions(-)
diff --git a/uberwriter/markup_regex.py b/uberwriter/markup_regex.py
index 8738843..7b2e9c0 100644
--- a/uberwriter/markup_regex.py
+++ b/uberwriter/markup_regex.py
@@ -31,6 +31,6 @@ TABLE = re.compile(
 MATH = re.compile(
     r"([$]{1,2})[^` ](?P<text>.+?)[^`\\ ]\1")
 FOOTNOTE_ID = re.compile(
-    r"[^\s]+\[\^(?P<id>[^\s]+)\]")
+    r"[^\s]+\[\^(?P<id>(?P<text>[^\s]+))\]")
 FOOTNOTE = re.compile(
     r"(?:^\n*|\n\n)\[\^(?P<id>[^\s]+)\]: (?P<text>(?:[^\n]+|\n+(?=(?:\t| {4})))+)(?:\n+|$)", re.M)
diff --git a/uberwriter/stats_counter.py b/uberwriter/stats_counter.py
index adeac08..ec54095 100644
--- a/uberwriter/stats_counter.py
+++ b/uberwriter/stats_counter.py
@@ -1,21 +1,19 @@
-import math
 import re
 from queue import Queue
 from threading import Thread
 
 from gi.repository import GLib
 
-from uberwriter import helpers
+from uberwriter.markup_regex import ITALIC, BOLD_ITALIC, BOLD, STRIKETHROUGH, IMAGE, LINK, \
+    HORIZONTAL_RULE, LIST, MATH, TABLE, CODE_BLOCK, HEADER_UNDER, HEADER, BLOCK_QUOTE, ORDERED_LIST, \
+    FOOTNOTE_ID, FOOTNOTE
 
 
 class StatsCounter:
     """Counts characters, words, sentences and read time using a background thread."""
 
-    # Regexp that matches characters, with the following exceptions:
-    # * Newlines
-    # * Sequential spaces
-    # * Sequential dashes
-    CHARACTERS = re.compile(r"[^\s-]|(?:[^\S\n](?!\s)|-(?![-\n]))")
+    # Regexp that matches any character, except for newlines and sequential whitespace.
+    CHARACTERS = re.compile(r"[^\s]|(?:[^\S\n](?!\s))")
 
     # Regexp that matches Asian letters, general symbols and hieroglyphs,
     # as well as sequences of word characters optionally containing non-word characters in-between.
@@ -28,6 +26,17 @@ class StatsCounter:
     # Regexp that matches paragraphs, ie. anything separated by newlines.
     PARAGRAPHS = re.compile(r".+\n?")
 
+    # List of regexps whose matches should be replaced by their "text" group. Order is important.
+    MARKUP_REGEXP_REPLACE = (
+        BOLD_ITALIC, ITALIC, BOLD, STRIKETHROUGH, IMAGE, LINK, LIST, ORDERED_LIST, BLOCK_QUOTE,
+        HEADER, HEADER_UNDER, CODE_BLOCK, TABLE, MATH, FOOTNOTE_ID, FOOTNOTE
+    )
+
+    # List of regexps whose matches should be removed. Order is important.
+    MARKUP_REGEXP_REMOVE = (
+        HORIZONTAL_RULE,
+    )
+
     def __init__(self):
         super().__init__()
 
@@ -59,7 +68,10 @@ class StatsCounter:
                 if self.queue.empty():
                     break
 
-            text = helpers.pandoc_convert(text, to="plain")
+            for regexp in self.MARKUP_REGEXP_REPLACE:
+                text = re.sub(regexp, r"\g<text>", text)
+            for regexp in self.MARKUP_REGEXP_REMOVE:
+                text = re.sub(regexp, "", text)
 
             character_count = len(re.findall(self.CHARACTERS, text))