Use markup regexp for character count

Deferring to Pandoc is not without its faults. Its output still requires
post-processing (e.g. horizontal rules are turned into 72 dashes), and it
is significantly slower and more resource-hungry.

Conversely, the markup regexps have improved over time and are now able
to handle the task.
github/fork/yochananmarqos/patch-1
Gonçalo Silva 2019-06-08 03:02:47 +01:00
parent aae38ddb5f
commit 3f4f8292ca
2 changed files with 21 additions and 9 deletions

View File

@ -31,6 +31,6 @@ TABLE = re.compile(
MATH = re.compile(
r"([$]{1,2})[^` ](?P<text>.+?)[^`\\ ]\1")
FOOTNOTE_ID = re.compile(
r"[^\s]+\[\^(?P<id>[^\s]+)\]")
r"[^\s]+\[\^(?P<id>(?P<text>[^\s]+))\]")
FOOTNOTE = re.compile(
r"(?:^\n*|\n\n)\[\^(?P<id>[^\s]+)\]: (?P<text>(?:[^\n]+|\n+(?=(?:\t| {4})))+)(?:\n+|$)", re.M)

View File

@ -1,21 +1,19 @@
import math
import re
from queue import Queue
from threading import Thread
from gi.repository import GLib
from uberwriter import helpers
from uberwriter.markup_regex import ITALIC, BOLD_ITALIC, BOLD, STRIKETHROUGH, IMAGE, LINK, \
HORIZONTAL_RULE, LIST, MATH, TABLE, CODE_BLOCK, HEADER_UNDER, HEADER, BLOCK_QUOTE, ORDERED_LIST, \
FOOTNOTE_ID, FOOTNOTE
class StatsCounter:
"""Counts characters, words, sentences and read time using a background thread."""
# Regexp that matches characters, with the following exceptions:
# * Newlines
# * Sequential spaces
# * Sequential dashes
CHARACTERS = re.compile(r"[^\s-]|(?:[^\S\n](?!\s)|-(?![-\n]))")
# Regexp that matches any character, except for newlines and subsequent spaces.
CHARACTERS = re.compile(r"[^\s]|(?:[^\S\n](?!\s))")
# Regexp that matches Asian letters, general symbols and hieroglyphs,
# as well as sequences of word characters optionally containing non-word characters in-between.
@ -28,6 +26,17 @@ class StatsCounter:
# Regexp that matches paragraphs, ie. anything separated by newlines.
PARAGRAPHS = re.compile(r".+\n?")
# List of regexp whose matches should be replaced by their "text" group. Order is important.
MARKUP_REGEXP_REPLACE = (
BOLD_ITALIC, ITALIC, BOLD, STRIKETHROUGH, IMAGE, LINK, LIST, ORDERED_LIST, BLOCK_QUOTE,
HEADER, HEADER_UNDER, CODE_BLOCK, TABLE, MATH, FOOTNOTE_ID, FOOTNOTE
)
# List of regexp whose matches should be removed. Order is important.
MARKUP_REGEXP_REMOVE = (
HORIZONTAL_RULE,
)
def __init__(self):
super().__init__()
@ -59,7 +68,10 @@ class StatsCounter:
if self.queue.empty():
break
text = helpers.pandoc_convert(text, to="plain")
for regexp in self.MARKUP_REGEXP_REPLACE:
text = re.sub(regexp, r"\g<text>", text)
for regexp in self.MARKUP_REGEXP_REMOVE:
text = re.sub(regexp, "", text)
character_count = len(re.findall(self.CHARACTERS, text))