From 9807741b13673c6e8b6f8bf5c9c1c3cbf27ded76 Mon Sep 17 00:00:00 2001 From: Wolf Vollprecht Date: Thu, 2 Oct 2014 19:04:22 +0200 Subject: [PATCH] added some of said experimental features --- .gitignore | 1 + uberwriter/UberwriterAutoCorrect.py | 201 +++++ uberwriter_lib/pressagio/VERSION | 1 + uberwriter_lib/pressagio/__init__.py | 34 + uberwriter_lib/pressagio/callback.py | 37 + uberwriter_lib/pressagio/character.py | 34 + uberwriter_lib/pressagio/combiner.py | 64 ++ uberwriter_lib/pressagio/context_tracker.py | 177 +++++ uberwriter_lib/pressagio/dbconnector.py | 745 ++++++++++++++++++ uberwriter_lib/pressagio/observer.py | 71 ++ uberwriter_lib/pressagio/predictor.py | 425 ++++++++++ uberwriter_lib/pressagio/tests/__init__.py | 8 + .../pressagio/tests/test_character.py | 27 + .../pressagio/tests/test_combiner.py | 74 ++ .../tests/test_data/der_linksdenker.txt | 28 + .../tests/test_data/profile_smoothedngram.ini | 26 + .../pressagio/tests/test_dbconnector.py | 253 ++++++ .../pressagio/tests/test_predictor.py | 143 ++++ .../pressagio/tests/test_tokenizer.py | 91 +++ uberwriter_lib/pressagio/tokenizer.py | 289 +++++++ 20 files changed, 2729 insertions(+) create mode 100644 uberwriter/UberwriterAutoCorrect.py create mode 100644 uberwriter_lib/pressagio/VERSION create mode 100644 uberwriter_lib/pressagio/__init__.py create mode 100644 uberwriter_lib/pressagio/callback.py create mode 100644 uberwriter_lib/pressagio/character.py create mode 100644 uberwriter_lib/pressagio/combiner.py create mode 100644 uberwriter_lib/pressagio/context_tracker.py create mode 100644 uberwriter_lib/pressagio/dbconnector.py create mode 100644 uberwriter_lib/pressagio/observer.py create mode 100644 uberwriter_lib/pressagio/predictor.py create mode 100644 uberwriter_lib/pressagio/tests/__init__.py create mode 100644 uberwriter_lib/pressagio/tests/test_character.py create mode 100644 uberwriter_lib/pressagio/tests/test_combiner.py create mode 100644 uberwriter_lib/pressagio/tests/test_data/der_linksdenker.txt create mode 100644 uberwriter_lib/pressagio/tests/test_data/profile_smoothedngram.ini create mode 100644 uberwriter_lib/pressagio/tests/test_dbconnector.py create mode 100644 uberwriter_lib/pressagio/tests/test_predictor.py create mode 100644 uberwriter_lib/pressagio/tests/test_tokenizer.py create mode 100644 uberwriter_lib/pressagio/tokenizer.py diff --git a/.gitignore b/.gitignore index 48c1cbd..5250903 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ build/lib.linux-x86_64-2.7 +*.pyc __pycache__/ build/scripts-2.7 build/share diff --git a/uberwriter/UberwriterAutoCorrect.py b/uberwriter/UberwriterAutoCorrect.py new file mode 100644 index 0000000..4a57460 --- /dev/null +++ b/uberwriter/UberwriterAutoCorrect.py @@ -0,0 +1,201 @@ +# UberwriterAutoCorrect +# The Uberwriter Auto Correct is a auto correction +# mechanism to prevent stupid typos +# import presage +from gi.repository import Gtk, Gdk + +import uberwriter_lib.pressagio as pressagio +import enchant + + +# d = enchant.Dict("de_DE") +import re + +import uberwriter_lib.pressagio.predictor +import uberwriter_lib.pressagio.tokenizer +import uberwriter_lib.pressagio.dbconnector +import uberwriter_lib.pressagio.context_tracker +import uberwriter_lib.pressagio.callback + +import xml.etree.ElementTree as ET +import pickle + +from Levenshtein import distance + +import configparser +from uberwriter_lib.helpers import get_media_path + +# Define and create PresageCallback object + +class PressagioCallback(pressagio.callback.Callback): + def __init__(self, 
buffer): + super().__init__() + self.buffer = buffer + + def past_stream(self): + return self.buffer + + def future_stream(self): + return '' + +class UberwriterAutoCorrect: + def show_bubble(self, iter, suggestion): + self.suggestion = suggestion + if self.bubble: + self.bubble_label.set_text(suggestion) + else: + pos = self.TextView.get_iter_location(iter) + pos_adjusted = self.TextView.buffer_to_window_coords(Gtk.TextWindowType.TEXT, pos.x, pos.y + pos.height) + self.bubble = Gtk.Grid.new() + self.bubble.set_name("AutoCorrect") + self.TextView.add_child_in_window(self.bubble, Gtk.TextWindowType.TEXT, pos_adjusted[0], pos_adjusted[1]) + self.bubble_label = Gtk.Label.new(suggestion) + self.bubble.attach(self.bubble_label, 0, 0, 1, 1) + close = Gtk.Image.new_from_icon_name('dialog-close', Gtk.IconSize.SMALL_TOOLBAR) + self.bubble.attach(close, 1, 0, 1, 1) + self.bubble.show_all() + + def suggest(self, stump, context): + if self.enchant_dict.check(stump): + self.destroy_bubble() + return + + self.callback.buffer = ' '.join(context) + ' ' + stump + self.callback.buffer = self.callback.buffer.lstrip().rstrip() + predictions = [] + if self.use_pressagio: + predictions = self.prsgio.predict(6, None) + prediction = None + if not len(predictions): + if self.enchant_dict.check(stump): + self.destroy_bubble() + return + predictions = self.enchant_dict.suggest(stump) + suggestions_map = [] + for suggestion in predictions: + if suggestion in self.frequency_dict: + suggestions_map.append({'suggestion': suggestion, 'freq': self.frequency_dict[suggestion]}) + else: + suggestions_map.append({'suggestion': suggestion, 'freq': 0}) + + suggestions_map.sort(key= lambda x: x['freq']) + suggestions_map.reverse() + prediction = suggestions_map[0] + print(predictions) + prediction = predictions[0] + else: + prediction = predictions[0].word + anchor_iter = self.buffer.get_iter_at_mark(self.buffer.get_insert()) + anchor_iter.backward_visible_word_start() + if len(stump) >= 1: + self.show_bubble(anchor_iter, prediction) + + def destroy_bubble(self, *args): + if not self.bubble: + return + self.bubble.destroy() + self.bubble = None + self.suggestion = '' + + def get_frequency_dict(self, language): + self.frequency_dict = {} + pp_pickled = get_media_path("frequency_dict_" + self.language + ".pickle") + if pp_pickled and os.path.isfile(pp_pickled): + f = open(pp_pickled, 'rb') + self.frequency_dict = pickle.load(f) + f.close() + else: + pp = get_media_path('wordlists/en_us_wordlist.xml') + frequencies = ET.parse(pp) + root = frequencies.getroot() + for child in root: + self.frequency_dict[child.text] = int(child.attrib['f']) + f = open('pickled_dict', 'wb+') + pickle.dump(self.frequency_dict, f) + f.close() + + def accept_suggestion(self, append=""): + print("called") + curr_iter = self.buffer.get_iter_at_mark(self.buffer.get_insert()) + start_iter = curr_iter.copy() + start_iter.backward_visible_word_start() + self.buffer.delete(start_iter, curr_iter) + self.buffer.insert_at_cursor(self.suggestion + append) + self.destroy_bubble() + + def key_pressed(self, widget, event): + if not self.bubble: + return False + if event.keyval in [Gdk.KEY_Escape, Gdk.KEY_BackSpace]: + self.destroy_bubble() + return False + + def text_insert(self, buffer, location, + text, len, data=None): + # check if at end of a word + # if yes, check if suggestion available + # then display suggetion + if self.suggestion and text in [' ', '\t', '\n', '.', '?', '!', ',', ';', '\'', '"', ')', ':']: + self.accept_suggestion(append=text) + 
location.assign(self.buffer.get_iter_at_mark(self.buffer.get_insert())) + elif location.ends_word(): + iter_start = location.copy() + iter_start.backward_visible_word_starts(3) + text = buffer.get_text(iter_start, location, False) + words = text.split() + self.suggest(words[-1], words[0:-1]) + + def disable(self): + self.disabled = True + + def enable(self): + self.disabled = False + + def set_language(self, language): + print("Language changed to: %s" % language) + + # handle 2 char cases e.g. "en" + if(len(language) == 2): + if "en": + language = "en_US" + + if self.language == language: + return + + else: + self.language = language + print("Language changing") + config_file = get_media_path("pressagio_config.ini") + pres_config = configparser.ConfigParser() + pres_config.read(config_file) + pres_config.set("Database", "database", get_media_path("corpora/" + self.language + ".sqlite")) + self.context_tracker = pressagio.context_tracker.ContextTracker( + pres_config, self.predictor_registry, self.callback) + self.prsgio = self.predictor_registry[0] + + self.enchant_dict = enchant.Dict(self.language) + + def __init__(self, textview, textbuffer): + self.TextView = textview + self.buffer = textbuffer + self.suggestion = "" + self.bubble = self.bubble_label = None + self.buffer.connect_after('insert-text', self.text_insert) + self.TextView.connect('key-press-event', self.key_pressed) + + self.language = "en_US" + self.frequency_dict = {} + self.get_frequency_dict(self.language) + self.enchant_dict = enchant.Dict(self.language) + + self.use_pressagio = False + config_file = get_media_path("pressagio_config.ini") + pres_config = configparser.ConfigParser() + pres_config.read(config_file) + pres_config.set("Database", "database", get_media_path("corpora/" + self.language + ".sqlite")) + self.callback = PressagioCallback("") + + self.predictor_registry = pressagio.predictor.PredictorRegistry(pres_config) + self.context_tracker = pressagio.context_tracker.ContextTracker( + pres_config, self.predictor_registry, self.callback) + self.prsgio = self.predictor_registry[0] \ No newline at end of file diff --git a/uberwriter_lib/pressagio/VERSION b/uberwriter_lib/pressagio/VERSION new file mode 100644 index 0000000..7693c96 --- /dev/null +++ b/uberwriter_lib/pressagio/VERSION @@ -0,0 +1 @@ +0.1.3 \ No newline at end of file diff --git a/uberwriter_lib/pressagio/__init__.py b/uberwriter_lib/pressagio/__init__.py new file mode 100644 index 0000000..463ccf8 --- /dev/null +++ b/uberwriter_lib/pressagio/__init__.py @@ -0,0 +1,34 @@ +# -*- coding: utf-8 -*- +# +# Poio Tools for Linguists +# +# Copyright (C) 2009-2013 Poio Project +# Author: Peter Bouda +# URL: +# For license information, see LICENSE + +from . import predictor +from . 
import context_tracker + +class Pressagio: + + def __init__(self, callback, config, dbconnection = None): + self.config = config + self.callback = callback + + self.predictor_registry = pressagio.predictor.PredictorRegistry( + self.config, dbconnection) + self.context_tracker = pressagio.context_tracker.ContextTracker( + self.config, self.predictor_registry, callback) + + self.predictor_activator = pressagio.predictor.PredictorActivator( + self.config, self.predictor_registry, self.context_tracker) + self.predictor_activator.combination_policy = "meritocracy" + + def predict(self): + multiplier = 1 + predictions = self.predictor_activator.predict(multiplier) + return [p.word for p in predictions] + + def close_database(self): + self.predictor_registry.close_database() \ No newline at end of file diff --git a/uberwriter_lib/pressagio/callback.py b/uberwriter_lib/pressagio/callback.py new file mode 100644 index 0000000..5216316 --- /dev/null +++ b/uberwriter_lib/pressagio/callback.py @@ -0,0 +1,37 @@ +# -*- coding: utf-8 -*- +# +# Poio Tools for Linguists +# +# Copyright (C) 2009-2013 Poio Project +# Author: Peter Bouda +# URL: +# For license information, see LICENSE + +""" +Base class for callbacks. + +""" + +from __future__ import absolute_import, unicode_literals + +class Callback(object): + """ + Base class for callbacks. + + """ + + def __init__(self): + self.stream = "" + self.empty = "" + + def past_stream(self): + return self.stream + + def future_stream(self): + return self.empty + + def update(self, character): + if character == "\b" and len(stream) > 0: + self.stream[:-1] + else: + self.stream += character diff --git a/uberwriter_lib/pressagio/character.py b/uberwriter_lib/pressagio/character.py new file mode 100644 index 0000000..93beb96 --- /dev/null +++ b/uberwriter_lib/pressagio/character.py @@ -0,0 +1,34 @@ +# -*- coding: utf-8 -*- +# +# Poio Tools for Linguists +# +# Copyright (C) 2009-2013 Poio Project +# Author: Peter Bouda +# URL: +# For license information, see LICENSE + +from __future__ import absolute_import, unicode_literals + +import unicodedata + +blankspaces = " \f\n\r\t\v…" +separators = "`~!@#$%^&*()_-+=\\|]}[{'\";:/?.>,<†„“।॥ו–´’‘‚י0123456789ः" + +def first_word_character(string): + for i, ch in enumerate(string): + if is_word_character(ch): + return i + + return -1 + +def last_word_character(string): + result = first_word_character(string[::-1]) + if result == -1: + return -1 + return len(string) - result - 1 + +def is_word_character(char): + # check for letter category + if unicodedata.category(char)[0] == "L": + return True + return False diff --git a/uberwriter_lib/pressagio/combiner.py b/uberwriter_lib/pressagio/combiner.py new file mode 100644 index 0000000..fe8940b --- /dev/null +++ b/uberwriter_lib/pressagio/combiner.py @@ -0,0 +1,64 @@ +# -*- coding: utf-8 -*- +# +# Poio Tools for Linguists +# +# Copyright (C) 2009-2013 Poio Project +# Author: Peter Bouda +# URL: +# For license information, see LICENSE + +""" +Combiner classes to merge results from several predictors. + +""" + +from __future__ import absolute_import, unicode_literals + +import abc + +from . 
import predictor + +class Combiner(object): + """ + Base class for all combiners + """ + + __metaclass__ = abc.ABCMeta + + def __init__(self): + pass + + def filter(self, prediction): + seen_tokens = set() + result = predictor.Prediction() + for i, suggestion in enumerate(prediction): + token = suggestion.word + if token not in seen_tokens: + for j in range(i+1, len(prediction)): + if token == prediction[j].word: + # TODO: interpolate here? + suggestion.probability += prediction[j].probability + if suggestion.probability > \ + predictor.MAX_PROBABILITY: + suggestion.probability = \ + MAX_PROBABILITY + seen_tokens.add(token) + result.add_suggestion(suggestion) + return result + + @abc.abstractmethod + def combine(self): + raise NotImplementedError("Method must be implemented") + + +class MeritocracyCombiner(Combiner): + + def __init__(self): + pass + + def combine(self, predictions): + result = predictor.Prediction() + for prediction in predictions: + for suggestion in prediction: + result.add_suggestion(suggestion) + return(self.filter(result)) diff --git a/uberwriter_lib/pressagio/context_tracker.py b/uberwriter_lib/pressagio/context_tracker.py new file mode 100644 index 0000000..4fb07b3 --- /dev/null +++ b/uberwriter_lib/pressagio/context_tracker.py @@ -0,0 +1,177 @@ +# -*- coding: utf-8 -*- +# +# Poio Tools for Linguists +# +# Copyright (C) 2009-2013 Poio Project +# Author: Peter Bouda +# URL: +# For license information, see LICENSE + +""" +Class for context tracker. + +""" + +from __future__ import absolute_import, unicode_literals + +import copy +import io + +from . import character +from . import observer +from . import tokenizer + +DEFAULT_SLIDING_WINDOW_SIZE = 80 + +class InvalidCallbackException(Exception): pass + +class ContextChangeDetector(object): + + def __init__(self, lowercase): + self.lowercase = lowercase + self.sliding_windows_size = DEFAULT_SLIDING_WINDOW_SIZE + self.sliding_window = "" + + def update_sliding_window(self, string): + if len(string) <= self.sliding_windows_size: + self.sliding_window = string + else: + self.sliding_window = string[:-self.sliding_windows_size] + + def context_change(self, past_stream): + # rename for clarity + prev_context = self.sliding_window + curr_context = past_stream + + if len(prev_context) == 0: + if len(curr_context) == 0: + return False + else: + return True + + ctx_idx = curr_context.rfind(prev_context) + if ctx_idx == -1: + return True + + remainder = curr_context[ctx_idx + len(prev_context):] + idx = character.last_word_character(remainder) + if idx == -1: + if len(remainder) == 0: + return False + last_char = curr_context[ctx_idx + len(prev_context) - 1] + if character.is_word_character(last_char): + return False + else: + return True + + if idx == len(remainder) - 1: + return False + + return True + + def change(self, past_stream): + # rename for clarity + prev_context = self.sliding_window + curr_context = past_stream + + if len(prev_context) == 0: + return past_stream + + ctx_idx = curr_context.rfind(prev_context) + if ctx_idx == -1: + return past_stream + + result = curr_context[ctx_idx + len(prev_context):] + if (self.context_change(past_stream)): + sliding_window_stream = self.sliding_window + r_tok = tokenizer.ReverseTokenizer(sliding_window_stream) + r_tok.lowercase = self.lowercase + first_token = r_tok.next_token() + if not len(first_token) == 0: + result = first_token + result + + return result + +class ContextTracker(object): #observer.Observer + """ + Tracks the current context. 
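+    The past and future context is obtained from the callback's past_stream() and future_stream() methods.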
+ + """ + + def __init__(self, config, predictor_registry, callback): + #self.dispatcher = observer.Dispatcher(self) + self.config = config + self.lowercase = self.config.getboolean("ContextTracker", "lowercase_mode") + + self.registry = predictor_registry + if callback: + self.callback = callback + else: + raise InvalidCallbackException + + self.context_change_detector = ContextChangeDetector(self.lowercase) + self.registry.context_tracker = self + + self.sliding_windows_size = DEFAULT_SLIDING_WINDOW_SIZE + + def context_change(self): + return self.context_change_detector.context_change(self.past_stream()) + + def update_context(self): + change = self.context_change_detector.change(self.past_stream()) + tok = tokenizer.ForwardTokenizer(change) + tok.lowercase = self.lowercase + + change_tokens = [] + while(tok.has_more_tokens()): + token = tok.next_token() + change_tokens.append(token) + + if len(change_tokens) != 0: + # remove prefix (partially entered token or empty token) + change_tokens.pop() + + for predictor in self.predictor_registry: + predictor.learn(change_tokens) + + self.context_change_detector.update_sliding_window(self.past_stream()) + + def prefix(self): + self.token(0) + + def token(self, index): + past_string_stream = self.past_stream() + string_io = io.StringIO(past_string_stream) + tok = tokenizer.ReverseTokenizer(string_io) + tok.lowercase = self.lowercase + i = 0 + while tok.has_more_tokens() and i <= index: + token = tok.next_token() + i += 1 + if i <= index: + token = "" + + return token + + def extra_token_to_learn(self, index, change): + return self.token(index + len(change)) + + def future_stream(self): + return self.callback.future_stream() + + def past_stream(self): + return self.callback.past_stream() + + def is_completion_valid(self, completion): + prefix = self.prefix().lower() + if prefix in completion: + return True + return False + + def __repr__(self): + return self.callback.past_stream + "<|>" + self.callback.future_stream \ + + "\n" + +# def update(self, observable): +# self.dispatcher.dispatch(observable) + diff --git a/uberwriter_lib/pressagio/dbconnector.py b/uberwriter_lib/pressagio/dbconnector.py new file mode 100644 index 0000000..8fb5465 --- /dev/null +++ b/uberwriter_lib/pressagio/dbconnector.py @@ -0,0 +1,745 @@ +# -*- coding: utf-8 -*- +# +# Poio Tools for Linguists +# +# Copyright (C) 2001-2013 Poio Project +# Author: Peter Bouda +# URL: +# For license information, see LICENSE + +""" +Classes to connect to databases. + +""" + +from __future__ import absolute_import, unicode_literals + +import abc +import sqlite3 +import time +import re +import regex + +try: + import psycopg2 + psycopg2.extensions.register_type(psycopg2.extensions.UNICODE) + psycopg2.extensions.register_type(psycopg2.extensions.UNICODEARRAY) +except ImportError: + pass + +re_escape_singlequote = re.compile("'") + + +def _sqlite3_regex(expr, item): + return (not (not regex.search(expr, item))) + + +class DatabaseConnector(object): + """ + Base class for all database connectors. + + """ + + __metaclass__ = abc.ABCMeta + + def __init__(self, dbname, cardinality = 1): + """ + Constructor of the base class DababaseConnector. 
+ + Parameters + ---------- + dbname : str + path to the database file or database name + cardinality : int + default cardinality for n-grams + + """ + print("asjdas jdlkasj ljsa kdj lsakdj lk") + self.cardinality = cardinality + self.dbname = dbname + self.lowercase = False + self.normalize = False + + def create_ngram_table(self, cardinality): + """ + Creates a table for n-gram of a give cardinality. The table name is + constructed from this parameter, for example for cardinality `2` there + will be a table `_2_gram` created. + + Parameters + ---------- + cardinality : int + The cardinality to create a table for. + + """ + query = "CREATE TABLE IF NOT EXISTS _{0}_gram (".format(cardinality) + unique = "" + for i in reversed(range(cardinality)): + if i != 0: + unique += "word_{0}, ".format(i) + query += "word_{0} TEXT, ".format(i) + else: + unique += "word" + query += "word TEXT, count INTEGER, UNIQUE({0}) );".format( + unique) + + self.execute_sql(query) + + def delete_ngram_table(self, cardinality): + """ + Deletes the table for n-gram of a give cardinality. The table name is + constructed from this parameter, for example for cardinality `2` there + will be a table `_2_gram` deleted. + + Parameters + ---------- + cardinality : int + The cardinality of the table to delete. + + """ + + query = "DROP TABLE IF EXISTS _{0}_gram;".format(cardinality) + self.execute_sql(query) + + def create_index(self, cardinality): + """ + Create an index for the table with the given cardinality. + + Parameters + ---------- + cardinality : int + The cardinality to create a index for. + + """ + for i in reversed(range(cardinality)): + if i != 0: + query = "CREATE INDEX idx_{0}_gram_{1} ON _{0}_gram(word_{1});".format(cardinality, i) + self.execute_sql(query) + + def delete_index(self, cardinality): + """ + Delete index for the table with the given cardinality. + + Parameters + ---------- + cardinality : int + The cardinality of the index to delete. + + """ + for i in reversed(range(cardinality)): + if i != 0: + query = "DROP INDEX IF EXISTS idx_{0}_gram_{1};".format( + cardinality, i) + self.execute_sql(query) + + def create_unigram_table(self): + """ + Creates a table for n-grams of cardinality 1. + + """ + self.create_ngram_table(1) + + def create_bigram_table(self): + """ + Creates a table for n-grams of cardinality 2. + + """ + self.create_ngram_table(2) + + def create_trigram_table(self): + """ + Creates a table for n-grams of cardinality 3. + + """ + self.create_ngram_table(3) + + + def ngrams(self, with_counts=False): + """ + Returns all ngrams that are in the table. + + Parameters + ---------- + None + + Returns + ------- + ngrams : generator + A generator for ngram tuples. + + """ + query = "SELECT " + for i in reversed(range(self.cardinality)): + if i != 0: + query += "word_{0}, ".format(i) + elif i == 0: + query += "word" + + if with_counts: + query += ", count" + + query += " FROM _{0}_gram;".format(self.cardinality) + print(query) + result = self.execute_sql(query) + for row in result: + yield tuple(row) + + def unigram_counts_sum(self): + query = "SELECT SUM(count) from _1_gram;" + result = self.execute_sql(query) + print(result, query) + return self._extract_first_integer(result) + + def ngram_count(self, ngram): + """ + Gets the count for a given ngram from the database. + + Parameters + ---------- + ngram : iterable of str + A list, set or tuple of strings. + + Returns + ------- + count : int + The count of the ngram. 
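+            Returns 0 if the n-gram is not in the database.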
+ + """ + query = "SELECT count FROM _{0}_gram".format(len(ngram)) + query += self._build_where_clause(ngram) + query += ";" + + result = self.execute_sql(query) + + return self._extract_first_integer(result) + + def ngram_like_table(self, ngram, limit = -1): + print("NGRAM LIKE TABLE!\n\n\n") + query = "SELECT {0} FROM _{1}_gram {2} ORDER BY count DESC".format( + self._build_select_like_clause(len(ngram)), len(ngram), + self._build_where_like_clause(ngram)) + print(query) + if limit < 0: + query += ";" + else: + query += " LIMIT {0};".format(limit) + + return self.execute_sql(query) + + def ngram_like_table_filtered(self, ngram, filter, limit = -1): + pass + + def increment_ngram_count(self, ngram): + pass + + def insert_ngram(self, ngram, count): + """ + Inserts a given n-gram with count into the database. + + Parameters + ---------- + ngram : iterable of str + A list, set or tuple of strings. + count : int + The count for the given n-gram. + + """ + query = "INSERT INTO _{0}_gram {1};".format(len(ngram), + self._build_values_clause(ngram, count)) + self.execute_sql(query) + + def update_ngram(self, ngram, count): + """ + Updates a given ngram in the database. The ngram has to be in the + database, otherwise this method will stop with an error. + + Parameters + ---------- + ngram : iterable of str + A list, set or tuple of strings. + count : int + The count for the given n-gram. + + """ + query = "UPDATE _{0}_gram SET count = {1}".format(len(ngram), count) + query += self._build_where_clause(ngram) + query += ";" + self.execute_sql(query) + + def remove_ngram(self, ngram): + """ + Removes a given ngram from the databae. The ngram has to be in the + database, otherwise this method will stop with an error. + + Parameters + ---------- + ngram : iterable of str + A list, set or tuple of strings. + + """ + query = "DELETE FROM _{0}_gram".format(len(ngram)) + query += self._build_where_clause(ngram) + query += ";" + self.execute_sql(query) + + def open_database(self): + raise NotImplementedError("Method must be implemented") + + def close_database(self): + raise NotImplementedError("Method must be implemented") + + def execute_sql(self): + raise NotImplementedError("Method must be implemented") + + ############################################### Private methods + + def _build_values_clause(self, ngram, count): + ngram_escaped = [] + for n in ngram: + ngram_escaped.append(re_escape_singlequote.sub("''", n)) + + values_clause = "VALUES('" + values_clause += "', '".join(ngram_escaped) + values_clause += "', {0})".format(count) + return values_clause + + + + def _build_where_clause(self, ngram): + where_clause = " WHERE" + for i in range(len(ngram)): + n = re_escape_singlequote.sub("''", ngram[i]) + if i < (len(ngram) - 1): + where_clause += " word_{0} = '{1}' AND".format( + len(ngram) - i - 1, n) + else: + pattern = '(?:^%s){e<=%d}' % (n, 2) + where_clause += " word = '{0}'".format(n) + print(where_clause) + return where_clause + + def _build_select_like_clause(self, cardinality): + result = "" + for i in reversed(range(cardinality)): + if i != 0: + result += "word_{0}, ". 
format(i) + else: + result += "word, count" + return result + + def _build_where_like_clause(self, ngram): + where_clause = " WHERE" + for i in range(len(ngram)): + if i < (len(ngram) - 1): + where_clause += " word_{0} = '{1}' AND".format( + len(ngram) - i - 1, ngram[i]) + else: + pattern = '(?:%s){e<=%d}' % (ngram[-1], 0) + where_clause += " (word regexp '%s')" % pattern + return where_clause + + def _extract_first_integer(self, table): + count = 0 + if len(table) > 0: + if len(table[0]) > 0: + count = int(table[0][0]) + + if not count > 0: + count = 0 + return count + + +class SqliteDatabaseConnector(DatabaseConnector): + """ + Database connector for sqlite databases. + + """ + + def __init__(self, dbname, cardinality = 1): + """ + Constructor for the sqlite database connector. + + Parameters + ---------- + dbname : str + path to the database file + cardinality : int + default cardinality for n-grams + + """ + DatabaseConnector.__init__(self, dbname, cardinality) + self.con = None + self.open_database() + + def commit(self): + """ + Sends a commit to the database. + + """ + self.con.commit() + + def open_database(self): + """ + Opens the sqlite database. + + """ + self.con = sqlite3.connect(self.dbname) + self.con.create_function("regexp", 2, _sqlite3_regex) + + + def close_database(self): + """ + Closes the sqlite database. + + """ + if self.con: + self.con.close() + + def execute_sql(self, query): + """ + Executes a given query string on an open sqlite database. + + """ + c = self.con.cursor() + c.execute(query) + result = c.fetchall() + return result + + +class PostgresDatabaseConnector(DatabaseConnector): + """ + Database connector for postgres databases. + + """ + + def __init__(self, dbname, cardinality = 1, host = "localhost", port = 5432, + user = "postgres", password = None, connection = None): + """ + Constructor for the postgres database connector. + + Parameters + ---------- + dbname : str + the database name + cardinality : int + default cardinality for n-grams + host : str + hostname of the postgres database + port : int + port number of the postgres database + user : str + user name for the postgres database + password: str + user password for the postgres database + connection : connection + an open database connection + + """ + DatabaseConnector.__init__(self, dbname, cardinality) + self.con = connection + self.host = host + self.port = port + self.user = user + self.password = password + + def create_database(self): + """ + Creates an empty database if not exists. 
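+        If the normalize flag is set, a plperlu normalize() function based on Text::Unidecode is installed into the database as well.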
+ + """ + if not self._database_exists(): + con = psycopg2.connect(host=self.host, database="postgres", + user=self.user, password=self.password, port=self.port) + con.set_isolation_level( + psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT) + query = "CREATE DATABASE {0};".format(self.dbname) + c = con.cursor() + c.execute(query) + con.close() + + + if self.normalize: + self.open_database() + query = "CREATE EXTENSION IF NOT EXISTS \"plperlu\";" + self.execute_sql(query) + # query = """CREATE OR REPLACE FUNCTION normalize(str text) + #RETURNS text + #AS $$ + #import unicodedata + #return ''.join(c for c in unicodedata.normalize('NFKD', str) + #if unicodedata.category(c) != 'Mn') + #$$ LANGUAGE plpython3u IMMUTABLE;""" + # query = """CREATE OR REPLACE FUNCTION normalize(mystr text) + # RETURNS text + # AS $$ + # from unidecode import unidecode + # return unidecode(mystr.decode("utf-8")) + # $$ LANGUAGE plpythonu IMMUTABLE;""" + query = """CREATE OR REPLACE FUNCTION normalize(text) + RETURNS text + AS $$ + use Text::Unidecode; + return unidecode(shift); + $$ LANGUAGE plperlu IMMUTABLE;""" + self.execute_sql(query) + self.commit() + self.close_database() + + + def reset_database(self): + """ + Re-create an empty database. + + """ + if self._database_exists(): + con = psycopg2.connect(host=self.host, database="postgres", + user=self.user, password=self.password, port=self.port) + con.set_isolation_level( + psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT) + query = "DROP DATABASE {0};".format(self.dbname) + c = con.cursor() + c.execute(query) + con.close() + self.create_database() + + def create_index(self, cardinality): + """ + Create an index for the table with the given cardinality. + + Parameters + ---------- + cardinality : int + The cardinality to create a index for. + + """ + DatabaseConnector.create_index(self, cardinality) + query = "CREATE INDEX idx_{0}_gram_varchar ON _{0}_gram(word varchar_pattern_ops);".format(cardinality) + self.execute_sql(query) + + if self.lowercase: + + for i in reversed(range(cardinality)): + if i != 0: + query = "CREATE INDEX idx_{0}_gram_{1}_lower ON _{0}_gram(LOWER(word_{1}));".format(cardinality, i) + self.execute_sql(query) + + if self.normalize: + + query = "CREATE INDEX idx_{0}_gram_lower_normalized_varchar ON _{0}_gram(NORMALIZE(LOWER(word)) varchar_pattern_ops);".format(cardinality) + self.execute_sql(query) + + else: + + query = "CREATE INDEX idx_{0}_gram_lower_varchar ON _{0}_gram(LOWER(word) varchar_pattern_ops);".format(cardinality) + self.execute_sql(query) + + elif self.normalize: + + query = "CREATE INDEX idx_{0}_gram_normalized_varchar ON _{0}_gram(NORMALIZE(word) varchar_pattern_ops);".format(cardinality) + self.execute_sql(query) + + def delete_index(self, cardinality): + """ + Delete index for the table with the given cardinality. + + Parameters + ---------- + cardinality : int + The cardinality of the index to delete. 
+ + """ + DatabaseConnector.delete_index(self, cardinality) + + query = "DROP INDEX IF EXISTS idx_{0}_gram_varchar;".format(cardinality) + self.execute_sql(query) + query = "DROP INDEX IF EXISTS idx_{0}_gram_normalized_varchar;".format( + cardinality) + self.execute_sql(query) + query = "DROP INDEX IF EXISTS idx_{0}_gram_lower_varchar;".format( + cardinality) + self.execute_sql(query) + query = "DROP INDEX IF EXISTS idx_{0}_gram_lower_normalized_varchar;".\ + format(cardinality) + self.execute_sql(query) + for i in reversed(range(cardinality)): + if i != 0: + query = "DROP INDEX IF EXISTS idx_{0}_gram_{1}_lower;".format( + cardinality, i) + self.execute_sql(query) + + def commit(self): + """ + Sends a commit to the database. + + """ + self.con.commit() + + def open_database(self): + """ + Opens the sqlite database. + + """ + if not self.con: + try: + self.con = psycopg2.connect(host=self.host, + database=self.dbname, user=self.user, + password=self.password, port=self.port) + except psycopg2.Error as e: + print("Error while opening database:") + print(e.pgerror) + + def close_database(self): + """ + Closes the sqlite database. + + """ + if self.con: + self.con.close() + self.con = None + + def execute_sql(self, query): + """ + Executes a given query string on an open postgres database. + + """ + c = self.con.cursor() + c.execute(query) + result = [] + if c.rowcount > 0: + try: + result = c.fetchall() + except psycopg2.ProgrammingError: + pass + return result + + + ############################################### Private methods + + def _database_exists(self): + """ + Check if the database exists. + + """ + con = psycopg2.connect(host=self.host, database="postgres", + user=self.user, password=self.password, port=self.port) + query_check = "select datname from pg_catalog.pg_database" + query_check += " where datname = '{0}';".format(self.dbname) + c = con.cursor() + c.execute(query_check) + result = c.fetchall() + if len(result) > 0: + return True + return False + + def _build_where_like_clause(self, ngram): + where_clause = " WHERE" + for i in range(len(ngram)): + if i < (len(ngram) - 1): + if self.lowercase: + where_clause += " LOWER(word_{0}) = LOWER('{1}') AND".format( + len(ngram) - i - 1, ngram[i]) + else: + where_clause += " word_{0} = '{1}' AND".format( + len(ngram) - i - 1, ngram[i]) + else: + if ngram[-1] != "": + if self.lowercase: + if self. 
normalize: + where_clause += " NORMALIZE(LOWER(word)) LIKE NORMALIZE(LOWER('{0}%'))".format(ngram[-1]) + else: + where_clause += " LOWER(word) LIKE LOWER('{0}%')".format(ngram[-1]) + elif self.normalize: + where_clause += " NORMALIZE(word) LIKE NORMALIZE('{0}%')".format(ngram[-1]) + else: + where_clause += " word LIKE '{0}%'".format(ngram[-1]) + else: + # remove the " AND" + where_clause = where_clause[:-4] + + return where_clause + + +#################################################### Functions + +def insert_ngram_map_sqlite(ngram_map, ngram_size, outfile, append=False, + create_index=False): + sql = SqliteDatabaseConnector(outfile, ngram_size) + sql.create_ngram_table(ngram_size) + + for ngram, count in ngram_map.items(): + if append: + old_count = sql.ngram_count(ngram) + if old_count > 0: + sql.update_ngram(ngram, old_count + count) + else: + sql.insert_ngram(ngram, count) + else: + sql.insert_ngram(ngram, count) + + sql.commit() + + if create_index and not append: + sql.create_index(ngram_size) + + sql.close_database() + + +def insert_ngram_map_postgres(ngram_map, ngram_size, dbname, append=False, + create_index=False, host = "localhost", port = 5432, user = "postgres", + password = None, lowercase = False, normalize = False): + sql = PostgresDatabaseConnector(dbname, ngram_size, host, port, user, + password) + sql.lowercase = lowercase + sql.normalize = normalize + sql.create_database() + sql.open_database() + if not append: + sql.delete_index(ngram_size) + sql.delete_ngram_table(ngram_size) + sql.create_ngram_table(ngram_size) + + for ngram, count in ngram_map.items(): + if append: + old_count = sql.ngram_count(ngram) + if old_count > 0: + sql.update_ngram(ngram, old_count + count) + else: + sql.insert_ngram(ngram, count) + else: + sql.insert_ngram(ngram, count) + + sql.commit() + + if create_index and not append: + sql.create_index(ngram_size) + + sql.commit() + + sql.close_database() + +def _filter_ngrams(sql, dictionary): + for ngram in sql.ngrams(): + delete_ngram = False + for word in ngram: + if not word in dictionary: + delete_ngram = True + if delete_ngram: + sql.remove_ngram(ngram) + + +def filter_ngrams_sqlite(dictionary, ngram_size, outfile): + sql = SqliteDatabaseConnector(outfile, ngram_size) + _filter_ngrams(sql, dictionary) + sql.commit() + sql.close_database() + +def filter_ngrams_postgres(dictionary, ngram_size, dbname, host = "localhost", + port = 5432, user = "postgres", password = None): + sql = PostgresDatabaseConnector(dbname, ngram_size, host, port, user, + password) + sql.open_database() + + _filter_ngrams(sql, dictionary) + + sql.commit() + sql.close_database() \ No newline at end of file diff --git a/uberwriter_lib/pressagio/observer.py b/uberwriter_lib/pressagio/observer.py new file mode 100644 index 0000000..b40ed74 --- /dev/null +++ b/uberwriter_lib/pressagio/observer.py @@ -0,0 +1,71 @@ +# -*- coding: utf-8 -*- +# +# Poio Tools for Linguists +# +# Copyright (C) 2009-2013 Poio Project +# Author: Peter Bouda +# URL: +# For license information, see LICENSE + +from __future__ import absolute_import, unicode_literals + +import abc + +class Observer(object): + """ + Base class for classes that want to observer other classes, e.g. the + PredictorActivator. + + """ + + __metaclass__ = abc.ABCMeta + + @abc.abstractmethod + def update(self, observable): + raise NotImplementedError("Method must be implemented") + + +class Oberservable(object): + """ + Base class for everything that needs observation, e.g. the predictors. 
+ + """ + + def __init__(self): + self._observers = [] + + def attach(self, observer): + if not observer in self._observers: + self._observers.append(observer) + + def detach(self, observer): + try: + self._observers.remove(observer) + except ValueError: + pass + + def notify(self, modifier=None): + for observer in self._observers: + if modifier != observer: + observer.update(self) + +class Dispatcher(object): + """ + Dispatches observable notifications. + + """ + + def __init__(self, obj): + self.observables = [] + self.dispatch_dict = {} + self.obj = obj + + def map(self, observable, func): + observable.attach(obj) + self.observables.append(observable) + self.dispatch_dict[observable] = func + self.dispatch(observable) + + def dispatch(self, observable): + handler_func = self.dispatch_dict[observable] + handler_func(observable) diff --git a/uberwriter_lib/pressagio/predictor.py b/uberwriter_lib/pressagio/predictor.py new file mode 100644 index 0000000..d21d1fa --- /dev/null +++ b/uberwriter_lib/pressagio/predictor.py @@ -0,0 +1,425 @@ +# -*- coding: utf-8 -*- +# +# Poio Tools for Linguists +# +# Copyright (C) 2009-2013 Poio Project +# Author: Peter Bouda +# URL: +# For license information, see LICENSE + +""" +Classes for predictors and to handle suggestions and predictions. + +""" + +from __future__ import absolute_import, unicode_literals + +import os +try: + import configparser +except ImportError: + import ConfigParser as configparser + + +from . import dbconnector + +#import pressagio.observer + +MIN_PROBABILITY = 0.0 +MAX_PROBABILITY = 1.0 + +class SuggestionException(Exception): pass +class UnknownCombinerException(Exception): pass +class PredictorRegistryException(Exception): pass + +class Suggestion(object): + """ + Class for a simple suggestion, consists of a string and a probility for that + string. + + """ + + def __init__(self, word, probability): + print("I am a suggetsion") + self.word = word + self._probability = probability + + def __eq__(self, other): + if self.word == other.word and self.probability == other.probability: + return True + return False + + def __lt__(self, other): + if self.probability < other.probability: + return True + if self.probability == other.probability: + return self.word < other.word + return False + + def __repr__(self): + return "Word: {0} - Probability: {1}".format( + self.word, self.probability) + + def probability(): + doc = "The probability property." + def fget(self): + return self._probability + def fset(self, value): + if value < MIN_PROBABILITY or value > MAX_PROBABILITY: + raise SuggestionException("Probability is too high or too low.") + self._probability = value + def fdel(self): + del self._probability + return locals() + probability = property(**probability()) + + +class Prediction(list): + """ + Class for predictions from predictors. 
+ + """ + + def __init__(self): + pass + + def __eq__(self, other): + if self is other: + return True + if len(self) != len(other): + return False + for i, s in enumerate(other): + if not s == self[i]: + return False + return True + + def suggestion_for_token(self, token): + for s in self: + if s.word == token: + return s + + def add_suggestion(self, suggestion): + if len(self) == 0: + self.append(suggestion) + else: + i = 0 + while i < len(self) and suggestion < self[i]: + i += 1 + + self.insert(i, suggestion) + + +class PredictorActivator(object): + """ + PredictorActivator starts the execution of the active predictors, + monitors their execution and collects the predictions returned, or + terminates a predictor's execution if it execedes its maximum + prediction time. + + The predictions returned by the individual predictors are combined + into a single prediction by the active Combiner. + + """ + + def __init__(self, config, registry, context_tracker): + self.config = config + self.registry = registry + self.context_tracker = context_tracker + #self.dispatcher = pressagio.observer.Dispatcher(self) + self.predictions = [] + + self.combiner = None + self.max_partial_prediction_size = int(config.get( + "Selector", "suggestions")) + self.predict_time = None + self._combination_policy = None + + def combination_policy(): + doc = "The combination_policy property." + def fget(self): + return self._combination_policy + def fset(self, value): + self._combination_policy = value + if value.lower() == "meritocracy": + self.combiner = pressagio.combiner.MeritocracyCombiner() + else: + raise UnknownCombinerException() + def fdel(self): + del self._combination_policy + return locals() + combination_policy = property(**combination_policy()) + + def predict(self, multiplier = 1, prediction_filter = None): + self.predictions[:] = [] + for predictor in self.registry: + self.predictions.append(predictor.predict( + self.max_partial_prediction_size * multiplier, + prediction_filter)) + result = self.combiner.combine(self.predictions) + return result + + +class PredictorRegistry(list): #pressagio.observer.Observer, + """ + Manages instantiation and iteration through predictors and aids in + generating predictions and learning. + + PredictorRegitry class holds the active predictors and provides the + interface required to obtain an iterator to the predictors. + + The standard use case is: Predictor obtains an iterator from + PredictorRegistry and invokes the predict() or learn() method on each + Predictor pointed to by the iterator. + + Predictor registry should eventually just be a simple wrapper around + plump. + + """ + + def __init__(self, config, dbconnection = None): + self.config = config + self.dbconnection = dbconnection + self._context_tracker = None + self.set_predictors() + + def context_tracker(): + doc = "The context_tracker property." 
+ def fget(self): + return self._context_tracker + def fset(self, value): + if self._context_tracker is not value: + self._context_tracker = value + self[:] = [] + self.set_predictors() + def fdel(self): + del self._context_tracker + return locals() + context_tracker = property(**context_tracker()) + + def set_predictors(self): + if (self.context_tracker): + self[:] = [] + for predictor in self.config.get("PredictorRegistry", "predictors")\ + .split(): + self.add_predictor(predictor) + + def add_predictor(self, predictor_name): + predictor = None + if self.config.get(predictor_name, "predictor_class") == \ + "SmoothedNgramPredictor": + predictor = SmoothedNgramPredictor(self.config, + self.context_tracker, predictor_name, + dbconnection = self.dbconnection) + + if predictor: + self.append(predictor) + + def close_database(self): + for predictor in self: + predictor.close_database() + + +class Predictor(object): + """ + Base class for predictors. + + """ + + def __init__(self, config, context_tracker, predictor_name, + short_desc = None, long_desc = None): + self.short_description = short_desc + self.long_description = long_desc + self.context_tracker = context_tracker + self.name = predictor_name + self.config = config + + def token_satifies_filter(token, prefix, token_filter): + if token_filter: + for char in token_filter: + candidate = prefix + char + if token.startswith(candidate): + return True + return False + +class SmoothedNgramPredictor(Predictor): #, pressagio.observer.Observer + """ + Calculates prediction from n-gram model in sqlite database. You have to + create a database with the script `text2ngram` first. + + """ + + def __init__(self, config, context_tracker, predictor_name, + short_desc = None, long_desc = None, dbconnection = None): + Predictor.__init__(self, config, context_tracker, predictor_name, + short_desc, long_desc) + self.db = None + self.dbconnection = dbconnection + self.cardinality = None + self.learn_mode_set = False + + self.dbclass = None + self.dbuser = None + self.dbpass = None + self.dbhost = None + self.dbport = None + + self._database = None + self._deltas = None + self._learn_mode = None + self.config = config + self.name = predictor_name + self.context_tracker = context_tracker + self._read_config() + + ################################################## Properties + + def deltas(): + doc = "The deltas property." + def fget(self): + return self._deltas + def fset(self, value): + self._deltas = [] + # make sure that values are floats + for i, d in enumerate(value): + self._deltas.append(float(d)) + self.cardinality = len(value) + self.init_database_connector_if_ready() + def fdel(self): + del self._deltas + return locals() + deltas = property(**deltas()) + + def learn_mode(): + doc = "The learn_mode property." + def fget(self): + return self._learn_mode + def fset(self, value): + self._learn_mode = value + self.learn_mode_set = True + self.init_database_connector_if_ready() + def fdel(self): + del self._learn_mode + return locals() + learn_mode = property(**learn_mode()) + + def database(): + doc = "The database property." 
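+        # Setting this property also reads the connector class and, for Postgres, the credentials and lowercase/normalize flags from the [Database] config section.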
+ def fget(self): + return self._database + + def fset(self, value): + self._database = value + + self.dbclass = self.config.get("Database", "class") + if self.dbclass == "PostgresDatabaseConnector": + self.dbuser = self.config.get("Database", "user") + self.dbpass = self.config.get("Database", "password") + self.dbhost = self.config.get("Database", "host") + self.dbport = self.config.get("Database", "port") + self.dblowercase = self.config.getboolean("Database", + "lowercase_mode") + self.dbnormalize = self.config.getboolean("Database", + "normalize_mode") + + self.init_database_connector_if_ready() + + def fdel(self): + del self._database + return locals() + database = property(**database()) + + #################################################### Methods + + def init_database_connector_if_ready(self): + if self.database and len(self.database) > 0 and \ + self.cardinality and self.cardinality > 0 and \ + self.learn_mode_set: + if self.dbclass == "SqliteDatabaseConnector": + self.db = dbconnector.SqliteDatabaseConnector( + self.database, self.cardinality) #, self.learn_mode + elif self.dbclass == "PostgresDatabaseConnector": + self.db = dbconnector.PostgresDatabaseConnector( + self.database, self.cardinality, self.dbhost, self.dbport, + self.dbuser, self.dbpass, self.dbconnection) + self.db.lowercase = self.dblowercase + self.db.normalize = self.dbnormalize + self.db.open_database() + + def ngram_to_string(self, ngram): + "|".join(ngram) + + def predict(self, max_partial_prediction_size, filter): + print("SmoothedNgramPredictor Predicting") + print(filter) + tokens = [""] * self.cardinality + prediction = Prediction() + + for i in range(self.cardinality): + tokens[self.cardinality - 1 - i] = self.context_tracker.token(i) + prefix_completion_candidates = [] + for k in reversed(range(self.cardinality)): + if len(prefix_completion_candidates) >= max_partial_prediction_size: + break + prefix_ngram = tokens[(len(tokens) - k - 1):] + partial = None + if not filter: + partial = self.db.ngram_like_table(prefix_ngram, + max_partial_prediction_size - \ + len(prefix_completion_candidates)) + else: + partial = db.ngram_like_table_filtered(prefix_ngram, filter, + max_partial_prediction_size - \ + len(prefix_completion_candidates)) + + print((partial)) + for p in partial: + if len(prefix_completion_candidates) > \ + max_partial_prediction_size: + break + candidate = p[-2] # ??? 
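+                # Rows from ngram_like_table end in (word, count), as built by _build_select_like_clause, so p[-2] is the suggested word itself.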
+ if candidate not in prefix_completion_candidates: + prefix_completion_candidates.append(candidate) + + # smoothing + unigram_counts_sum = self.db.unigram_counts_sum() + for j, candidate in enumerate(prefix_completion_candidates): + #if j >= max_partial_prediction_size: + # break + tokens[self.cardinality - 1] = candidate + + probability = 0 + for k in range(self.cardinality): + numerator = self._count(tokens, 0, k + 1) + denominator = unigram_counts_sum + if numerator > 0: + denominator = self._count(tokens, -1, k) + frequency = 0 + if denominator > 0: + frequency = float(numerator) / denominator + probability += self.deltas[k] * frequency + + if probability > 0: + prediction.add_suggestion(Suggestion(tokens[self.cardinality - 1], + probability)) + return(prediction) + + def close_database(self): + self.db.close_database() + +################################################ Private methods + + def _read_config(self): + self.database = self.config.get("Database", "database") + self.deltas = self.config.get(self.name, "deltas").split() + self.learn_mode = self.config.get(self.name, "learn") + + def _count(self, tokens, offset, ngram_size): + result = 0 + if (ngram_size > 0): + ngram = \ + tokens[len(tokens) - ngram_size + offset:\ + len(tokens) + offset] + result = self.db.ngram_count(ngram) + else: + result = self.db.unigram_counts_sum() + return result diff --git a/uberwriter_lib/pressagio/tests/__init__.py b/uberwriter_lib/pressagio/tests/__init__.py new file mode 100644 index 0000000..6fac984 --- /dev/null +++ b/uberwriter_lib/pressagio/tests/__init__.py @@ -0,0 +1,8 @@ +# -*- coding: utf-8 -*- +# +# Poio Tools for Linguists +# +# Copyright (C) 2009-2013 Poio Project +# Author: Peter Bouda +# URL: +# For license information, see LICENSE diff --git a/uberwriter_lib/pressagio/tests/test_character.py b/uberwriter_lib/pressagio/tests/test_character.py new file mode 100644 index 0000000..25b1414 --- /dev/null +++ b/uberwriter_lib/pressagio/tests/test_character.py @@ -0,0 +1,27 @@ +# -*- coding: utf-8 -*- +# +# Poio Tools for Linguists +# +# Copyright (C) 2009-2013 Poio Project +# Author: Peter Bouda +# URL: +# For license information, see LICENSE + +from __future__ import absolute_import, unicode_literals + +import pressagio.character + +def test_first_word_character(): + assert pressagio.character.first_word_character("8238$§(a)jaj2u2388!") == 7 + assert pressagio.character.first_word_character("123üäö34ashdh") == 3 + assert pressagio.character.first_word_character("123&(/==") == -1 + +def test_last_word_character(): + assert pressagio.character.last_word_character("8238$§(a)jaj2u2388!") == 13 + assert pressagio.character.last_word_character("123üäö34ashdh") == 12 + assert pressagio.character.last_word_character("123&(/==") == -1 + +def test_is_word_character(): + assert pressagio.character.is_word_character("ä") == True + assert pressagio.character.is_word_character("1") == False + assert pressagio.character.is_word_character(".") == False diff --git a/uberwriter_lib/pressagio/tests/test_combiner.py b/uberwriter_lib/pressagio/tests/test_combiner.py new file mode 100644 index 0000000..0ce554a --- /dev/null +++ b/uberwriter_lib/pressagio/tests/test_combiner.py @@ -0,0 +1,74 @@ +# -*- coding: utf-8 -*- +# +# Poio Tools for Linguists +# +# Copyright (C) 2009-2013 Poio Project +# Author: Peter Bouda +# URL: +# For license information, see LICENSE + +from __future__ import absolute_import, unicode_literals + +import pressagio.predictor +import pressagio.combiner + +class 
TestMeritocracyCombiner: + + def setup(self): + self.combiner = pressagio.combiner.MeritocracyCombiner() + + def _create_prediction(self): + prediction = pressagio.predictor.Prediction() + prediction.add_suggestion(pressagio.predictor.Suggestion( + "Test", 0.3)) + prediction.add_suggestion(pressagio.predictor.Suggestion( + "Test2", 0.3)) + prediction.add_suggestion(pressagio.predictor.Suggestion( + "Test", 0.1)) + prediction.add_suggestion(pressagio.predictor.Suggestion( + "Test3", 0.2)) + return prediction + + def _create_prediction2(self): + prediction = pressagio.predictor.Prediction() + prediction.add_suggestion(pressagio.predictor.Suggestion( + "Test2", 0.3)) + prediction.add_suggestion(pressagio.predictor.Suggestion( + "Test", 0.1)) + prediction.add_suggestion(pressagio.predictor.Suggestion( + "Test3", 0.2)) + return prediction + + def test_filter(self): + result = self.combiner.filter( + self._create_prediction()) + + correct = pressagio.predictor.Prediction() + correct.add_suggestion(pressagio.predictor.Suggestion( + "Test3", 0.2)) + correct.add_suggestion(pressagio.predictor.Suggestion( + "Test2", 0.3)) + correct.add_suggestion(pressagio.predictor.Suggestion( + "Test", 0.4)) + + assert result == correct + + def test_combine(self): + predictions = [ self._create_prediction2() ] + prediction2 = self._create_prediction2() + prediction2.add_suggestion(pressagio.predictor.Suggestion( + "Test4", 0.1)) + predictions.append(prediction2) + result = self.combiner.combine(predictions) + + correct = pressagio.predictor.Prediction() + correct.add_suggestion(pressagio.predictor.Suggestion( + "Test3", 0.4)) + correct.add_suggestion(pressagio.predictor.Suggestion( + "Test2", 0.6)) + correct.add_suggestion(pressagio.predictor.Suggestion( + "Test4", 0.1)) + correct.add_suggestion(pressagio.predictor.Suggestion( + "Test", 0.2)) + + assert result == correct diff --git a/uberwriter_lib/pressagio/tests/test_data/der_linksdenker.txt b/uberwriter_lib/pressagio/tests/test_data/der_linksdenker.txt new file mode 100644 index 0000000..17b50f6 --- /dev/null +++ b/uberwriter_lib/pressagio/tests/test_data/der_linksdenker.txt @@ -0,0 +1,28 @@ +Der Linksdenker von Peter Panter + +"Er ist ein Gespenst und doch ein Münchner." +Alfred Polgar + +Das war ein heiterer Abschied von Berlin: sechs Wochen Panke und ein Abend Karl Valentin – die Rechnung ging ohne Rest auf. + +Ich kam zu spät ins Theater, der Saal war bereits warm und voll Lachen. Es mochte grade begonnen haben, aber die Leute waren animiert und vergnügt wie sonst erst nach dem zweiten Akt. Am Podium der Bühne auf der Bühne, mitten in der Vorstadtkapelle, saß ein Mann mit einer aufgeklebten Perücke, er sah aus, wie man sich sonst wohl einen Provinzkomiker vorstellt: ich blickte angestrengt auf die Szene und wußte beim besten Willen nicht, was es da wohl zu lachen gäbe … Aber die Leute lachten wieder, und der Mann hatte doch gar nichts gesagt ... Und plötzlich schweifte mein Auge ab, vorn in der ersten Reihe saß noch Einer, den hatte ich bisher nicht bemerkt, und das war: ER. + +Ein zaundürrer, langer Geselle, mit langen, spitzen Don-Quichotte-Beinen, mit winkligen, spitzigen Knien, einem Löchlein in der Hose, mit blankem, abgeschabtem Anzug. Sein Löchlein in der Hose – er reibt eifrig daran herum. "Das wird Ihnen nichts nützen!" sagt der gestrenge Orchesterchef. Er, leise vor sich hin: "Mit Benzin wärs scho fort!" Leise sagt er das, leise, wie seine schauspielerischen Mittel. 
Er ist sanft und zerbrechlich, schillert in allen Farben wie eine Seifenblase; wenn er plötzlich zerplatzte, hätte sich Niemand zu wundern. + +"Fertig!" klopft der Kapellmeister. Eins, zwei, drei – da, einen Sechzehnteltakt zuvor, setzte der dürre Bläser ab und bedeutete dem Kapellmeister mit ernstem Zeigefinger: "’s Krawattl rutscht Ihna heraus!" Aergerlich stopft sich der das Ding hinein. "Fertig!" Eins, zwei, drei … So viel, wie ein Auge Zeit braucht, die Wimper zu heben und zu senken, trennte die Kapelle noch von dem schmetternden Tusch – da setzte der Lange ab und sah um sich. Der Kapellmeister klopfte ab. Was es nun wieder gäbe –? "Ich muß mal husten!" sagte der Lange. Pause. Das Orchester wartet. Aber nun kann er nicht. Eins, zwei, drei – tätärätä! Es geht los. + +Und es beginnt die seltsamste Komik, die wir seit langem auf der Bühne gesehen haben: ein Höllentanz der Vernunft um beide Pole des Irrsinns. Das ist eine kleine Seele, dieser Bläser, mit Verbandsorgan, Tarif, Stammtisch und Kollegenklatsch. Er ist ängstlich auf seinen vereinbarten Verdienst und ein bißchen darüber hinaus auf seinen Vorteil bedacht. "Spielen Sie genau, was da steht," sagt der Kapellmeister, "nicht zu viel und nicht zu wenig!" "Zu viel schon gar nicht!" sagt das Verbandsmitglied. + +Oben auf der Bühne will der Vorhang nicht auseinander. "Geh mal sofort einer zum Tapezierer", sagt der Kapellmeister, "aber sofort, und sag ihm, er soll gelegentlich, wenn er Zeit hat, vorbeikommen." Geschieht. Der Tapezierer scheint sofort Zeit zu haben, denn er kommt mitten in die Sängerin hineingeplatzt. Steigt mit der Leiter auf die Bühne – "Zu jener Zeit, wie liebt ich dich, mein Leben", heult die Sängerin – und packt seine Instrumente aus, klopft, hämmert, macht … Seht doch Valentin! Er ist nicht zu halten. Was gibt es da? Was mag da sein? Er hat die Neugier der kleinen Leute. Immer geigend, denn das ist seine bezahlte Pflicht, richtet er sich hoch, steigt auf den Stuhl, reckt zwei Hälse, den seinen und den der Geige, klettert wieder herunter, schreitet durch das Orchester, nach oben auf die Bühne, steigt dort dem Tapezierer auf seiner Leiter nach, geigt und sieht, arbeitet und guckt, was es da Interessantes gibt … Ich muß lange zurückdenken, um mich zu erinnern, wann in einem Theater so gelacht worden ist. + +Er denkt links. Vor Jahren hat er einmal in München in einem Bierkeller gepredigt: "Vorgestern bin ich mit meiner Großmutter in der Oper ‚Lohengrin‘ gewesen. Gestern nacht hat sie die ganze Oper nochmal geträumt; das wann i gwußt hätt, hätten wir gar nicht erst hingehen brauchen!" + +Aber dieser Schreiber, der sich abends sein Brot durch einen kleinen Nebenverdienst aufbessert, wird plötzlich transparent, durchsichtig, über- und unterirdisch und beginnt zu leuchten. Berühren diese langen Beine noch die Erde? + +Es erhebt sich das schwere Problem, eine Pauke von einem Ende der Bühne nach dem andern zu schaffen. Der Auftrag fällt auf Valentin. "I bin eigentlich a Bläser!" sagt er. Bläser schaffen keine Pauken fort. Aber, na … Laatscht hin. Allein geht es nicht. Sein Kollege soll helfen. Und hier wird die Sache durchaus mondsüchtig. "Schafft die Pauke her!" ruft der Kapellmeister ungeduldig. Der Kollege kneetscht in seinen Bart: "Muß das gleich sein?" Der Kapellmeister: "Bringt die Pauke her!" Valentin: "Der Andre laßt fragen, wann." "Der Andre" – nicht: Peperl oder: Herr Schmidt oder: Kollege Hintermüller, sondern: der Andre. Der Andre wird Schicksal, Moira und nachbarlicher Kosmos. 
Sie drehen sich eine Weile um die Pauke, schließlich sagt "der Andre", er müsse hier stehen, denn er sei Linkshänder. Linkshänder? Vergessen sind Pauke, Kapellmeister und Theateraufführung – Linkshänder! Und nun, ganz Shakespearisch: "Linkshänder bist? Alles links? Beim Schreiben auch? Beim Essen auch? Beim Schlucken auch? Beim Denken auch?" Und dann triumphierend: "Der Andre sagt, er ist links!" Welche Distanz ist da vom "Andern" – wie diesseits ist man selbst, wie jenseits der Andre, wie verschieden, wie getrennt, wie weitab! Mitmensch? Nebenmensch. + +Sicherlich legen wir hier das Philosophische hinein. Sicherlich hat Valentin theoretisch diese Gedankengänge nicht gehabt. Aber man zeige uns doch erst einmal einen Komiker, ein Gefäß, in das man so etwas hineinlegen kann. Bei Herrn Westermeier käme man nicht auf solche Gedanken. Hier aber erhebt sich zum Schluß eine Unterhaltung über den Zufall, ein Hin und Her, kleine magische Funken, die aus einem merkwürdig konstruierten Gehirn sprühen. Er sei Unter den Linden spaziert, mit dem Nebenmann, da hätten sie von einem Radfahrer gesprochen – und da sei gerade einer des Wegs gekommen. Dies zum Kapitel: Zufall. Der Kapellmeister tobt. Das sei kein Zufall – das sei Unsinn. Da kämen tausend Radfahrer täglich vorbei. "Na ja", sagt Valentin, "aber es ist grad Einer kumma!" Unvorstellbar, wie so etwas ausgedacht, geschrieben, probiert wird. Die Komik der irrealen Potentialsätze, die monströse Zerlegung des Satzes: "Ich sehe, daß er nicht da ist!" (was sich da erhebt, ist überhaupt nicht zu sagen!) – die stille Dummheit dieses Witzes, der irrational ist und die leise Komponente des korrigierenden Menschenverstandes nicht aufweist, zwischendurch trinkt er aus einem Seidel Bier, kaut etwas, das er in der Tasche aufbewahrt hatte, denkt mit dem Zeigefinger und hat seine kleine Privatfreude, wenn sich der Kapellmeister geirrt hat. Eine kleine Seele. Als Hans Reimann einmal eine Rundfrage stellte, was sich Jedermann wünschen würde, wenn ihm eine Fee drei Wünsche freistellte, hat Karl Valentin geantwortet: "1.) Ewige Gesundheit. 2.) Einen Leibarzt." Eine kleine Seele. + +Und ein großer Künstler. Wenn ihn nur nicht die berliner Unternehmer einfangen möchten! Das Geheimnis dieses primitiven Ensembles ist seine kräftige Naivität. Das ist eben so, und wems nicht paßt, der soll nicht zuschauen. Gott behüte, wenn man den zu Duetten und komischen Couplets abrichtete! Mit diesen verdrossenen, verquälten, nervösen Regisseuren und Direktoren auf der Probe, die nicht zuhören und zunächst einmal zu Allem Nein sagen. Mit diesem Drum und Dran von unangenehmen berliner Typen, die vorgeben, zu wissen, was das Publikum will, mit dem sie ihren nicht sehr heitern Kreis identifizieren, mit diesen überarbeiteten und unfrohen Gesellen, die nicht mehr fähig sind, von Herzen über das Einfache zu lachen, "weil es schon dagewesen ist". Sie jedenfalls sind immer schon dagewesen. Karl Valentin aber nur ein Mal, weil er ein seltener, trauriger, unirdischer, maßlos lustiger Komiker ist, der links denkt. 
+ +Quelle: http://de.wikisource.org/wiki/Der_Linksdenker \ No newline at end of file diff --git a/uberwriter_lib/pressagio/tests/test_data/profile_smoothedngram.ini b/uberwriter_lib/pressagio/tests/test_data/profile_smoothedngram.ini new file mode 100644 index 0000000..6370d5f --- /dev/null +++ b/uberwriter_lib/pressagio/tests/test_data/profile_smoothedngram.ini @@ -0,0 +1,26 @@ +# Template for profiles +[Database] +class = SqliteDatabaseConnector +database = c:/Users/Peter/Projects/git-github/pressagio/src/pressagio/tests/test_data/test.db + +[PredictorRegistry] +predictors = DefaultSmoothedNgramPredictor + +[DefaultSmoothedNgramPredictor] +predictor_class = SmoothedNgramPredictor +deltas = 0.01 0.1 0.89 +learn = True + +[ContextTracker] +sliding_window_size = 80 +lowercase_mode = True + +[Selector] +suggestions = 6 +repeat_suggestions = no +greedy_suggestion_threshold = 0 + +[PredictorActivator] +predict_time = 100 +max_partial_prediction_size = 60 +combination_policy = Meritocracy diff --git a/uberwriter_lib/pressagio/tests/test_dbconnector.py b/uberwriter_lib/pressagio/tests/test_dbconnector.py new file mode 100644 index 0000000..5d5d02f --- /dev/null +++ b/uberwriter_lib/pressagio/tests/test_dbconnector.py @@ -0,0 +1,253 @@ +# -*- coding: utf-8 -*- +# +# Poio Tools for Linguists +# +# Copyright (C) 2001-2013 Poio Project +# Author: Peter Bouda +# URL: +# For license information, see LICENSE + +from __future__ import absolute_import, unicode_literals + +import os + +import pressagio.dbconnector + +psycopg2_installed = False +try: + import psycopg2 + psycopg2_installed = True +except ImportError: + pass + +class TestSqliteDatabaseConnector(): + + def setup(self): + self.filename = os.path.abspath(os.path.join(os.path.dirname( __file__ ), + 'test_data', 'test.db')) + self.connector = pressagio.dbconnector.SqliteDatabaseConnector(self.filename) + self.connector.open_database() + + def test_execute_sql(self): + self.connector.execute_sql("CREATE TABLE IF NOT EXISTS test ( c1 TEXT, c2 INTEGER );") + + def test_create_ngram_table(self): + self.connector.create_ngram_table(1) + result = self.connector.execute_sql( + "SELECT name FROM sqlite_master WHERE type='table' AND name='_1_gram';") + assert result == [('_1_gram',)] + self.connector.execute_sql("DROP TABLE _1_gram;") + + self.connector.create_ngram_table(2) + result = self.connector.execute_sql( + "SELECT name FROM sqlite_master WHERE type='table' AND name='_2_gram';") + assert result == [('_2_gram',)] + self.connector.execute_sql("DROP TABLE _2_gram;") + + self.connector.create_ngram_table(3) + result = self.connector.execute_sql( + "SELECT name FROM sqlite_master WHERE type='table' AND name='_3_gram';") + assert result == [('_3_gram',)] + self.connector.execute_sql("DROP TABLE _3_gram;") + + def test_create_index(self): + self.connector.create_ngram_table(2) + self.connector.insert_ngram(('der', 'linksdenker'), 22) + self.connector.create_index(2) + result = self.connector.execute_sql( + "SELECT name FROM sqlite_master WHERE type='index' \ + AND name='idx_2_gram_1';") + assert result == [('idx_2_gram_1',)] + + self.connector.execute_sql("DROP TABLE _2_gram;") + + def test_create_unigram_table(self): + self.connector.create_unigram_table() + result = self.connector.execute_sql( + "SELECT name FROM sqlite_master WHERE type='table' AND name='_1_gram';") + assert result == [('_1_gram',)] + self.connector.execute_sql("DROP TABLE _1_gram;") + + def test_create_bigram_table(self): + self.connector.create_bigram_table() + result = 
self.connector.execute_sql( + "SELECT name FROM sqlite_master WHERE type='table' AND name='_2_gram';") + assert result == [('_2_gram',)] + self.connector.execute_sql("DROP TABLE _2_gram;") + + def test_create_trigram_table(self): + self.connector.create_trigram_table() + result = self.connector.execute_sql( + "SELECT name FROM sqlite_master WHERE type='table' AND name='_3_gram';") + assert result == [('_3_gram',)] + self.connector.execute_sql("DROP TABLE _3_gram;") + + def test_insert_ngram(self): + self.connector.create_bigram_table() + self.connector.insert_ngram(('der', 'linksdenker'), 22) + result = self.connector.execute_sql("SELECT * FROM _2_gram") + assert result == [('der', 'linksdenker', 22)] + self.connector.execute_sql("DROP TABLE _2_gram;") + + def test_update_ngram(self): + self.connector.create_bigram_table() + + # Insert + self.connector.insert_ngram(('der', 'linksdenker'), 22) + result = self.connector.execute_sql("SELECT * FROM _2_gram") + assert result == [('der', 'linksdenker', 22)] + + # Update + self.connector.update_ngram(('der', 'linksdenker'), 44) + result = self.connector.execute_sql("SELECT * FROM _2_gram") + assert result == [('der', 'linksdenker', 44)] + + self.connector.execute_sql("DROP TABLE _2_gram;") + + def test_ngram_count(self): + self.connector.create_bigram_table() + self.connector.insert_ngram(('der', 'linksdenker'), 22) + result = self.connector.ngram_count(('der', 'linksdenker')) + assert result == 22 + self.connector.execute_sql("DROP TABLE _2_gram;") + + def test_ngram_like_table(self): + self.connector.create_bigram_table() + self.connector.insert_ngram(('der', 'linksdenker'), 22) + self.connector.insert_ngram(('der', 'linksabbieger'), 32) + result = self.connector.ngram_like_table(('der', 'links')) + assert result == [('der', 'linksabbieger', 32), ( + 'der', 'linksdenker', 22)] + self.connector.execute_sql("DROP TABLE _2_gram;") + + def teardown(self): + self.connector.close_database() + if os.path.isfile(self.filename): + os.remove(self.filename) + +if psycopg2_installed: + class TestPostgresDatabaseConnector(): + + def setup(self): + self.connector = pressagio.dbconnector.PostgresDatabaseConnector("test") + self.connector.create_database() + self.connector.open_database() + + def test_create_database(self): + self.connector.create_database() + + def test_create_ngram_table(self): + self.connector.create_ngram_table(1) + result = self.connector.execute_sql( + "SELECT * FROM information_schema.tables WHERE table_name='_1_gram';") + assert len(result) == 1 + self.connector.execute_sql("DROP TABLE _1_gram;") + + self.connector.create_ngram_table(2) + result = self.connector.execute_sql( + "SELECT * FROM information_schema.tables WHERE table_name='_2_gram';") + assert len(result) == 1 + self.connector.execute_sql("DROP TABLE _2_gram;") + + self.connector.create_ngram_table(3) + result = self.connector.execute_sql( + "SELECT * FROM information_schema.tables WHERE table_name='_3_gram';") + assert len(result) == 1 + self.connector.execute_sql("DROP TABLE _3_gram;") + + def test_create_unigram_table(self): + self.connector.create_unigram_table() + result = self.connector.execute_sql( + "SELECT * FROM information_schema.tables WHERE table_name='_1_gram';") + assert len(result) == 1 + self.connector.execute_sql("DROP TABLE _1_gram;") + + def test_create_bigram_table(self): + self.connector.create_bigram_table() + result = self.connector.execute_sql( + "SELECT * FROM information_schema.tables WHERE table_name='_2_gram';") + assert len(result) == 1 + 
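# unlike SQLite's sqlite_master, Postgres reports tables via information_schema + # drop the table so each test leaves behind a clean schema +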
self.connector.execute_sql("DROP TABLE _2_gram;") + + def test_create_trigram_table(self): + self.connector.create_trigram_table() + result = self.connector.execute_sql( + "SELECT * FROM information_schema.tables WHERE table_name='_3_gram';") + assert len(result) == 1 + self.connector.execute_sql("DROP TABLE _3_gram;") + + def test_insert_ngram(self): + self.connector.create_bigram_table() + self.connector.insert_ngram(('der', 'linksdenker'), 22) + result = self.connector.execute_sql("SELECT * FROM _2_gram") + assert result == [('der', 'linksdenker', 22)] + self.connector.execute_sql("DROP TABLE _2_gram;") + + def test_update_ngram(self): + self.connector.create_bigram_table() + + # Insert + self.connector.insert_ngram(('der', 'linksdenker'), 22) + result = self.connector.execute_sql("SELECT * FROM _2_gram") + assert result == [('der', 'linksdenker', 22)] + + # Update + self.connector.update_ngram(('der', 'linksdenker'), 44) + result = self.connector.execute_sql("SELECT * FROM _2_gram") + assert result == [('der', 'linksdenker', 44)] + + self.connector.execute_sql("DROP TABLE _2_gram;") + + def test_ngram_count(self): + self.connector.create_bigram_table() + self.connector.insert_ngram(('der', 'linksdenker'), 22) + result = self.connector.ngram_count(('der', 'linksdenker')) + assert result == 22 + self.connector.execute_sql("DROP TABLE _2_gram;") + + def test_ngram_like_table(self): + self.connector.create_bigram_table() + self.connector.insert_ngram(('der', 'linksdenker'), 22) + self.connector.insert_ngram(('der', 'linksabbieger'), 32) + result = self.connector.ngram_like_table(('der', 'links')) + assert result == [('der', 'linksabbieger', 32), ( + 'der', 'linksdenker', 22)] + self.connector.execute_sql("DROP TABLE _2_gram;") + + # testing lowercase mode + self.connector.lowercase = True + self.connector.close_database() + self.connector.reset_database() + self.connector.open_database() + self.connector.create_bigram_table() + self.connector.insert_ngram(('Der', 'Linksdenker'), 22) + self.connector.insert_ngram(('Der', 'Linksabbieger'), 32) + result = self.connector.ngram_like_table(('der', 'links')) + assert result == [('Der', 'Linksabbieger', 32), ( + 'Der', 'Linksdenker', 22)] + self.connector.execute_sql("DROP TABLE _2_gram;") + + # testing normalize mode + self.connector.normalize = True + self.connector.close_database() + self.connector.reset_database() + self.connector.open_database() + self.connector.create_bigram_table() + self.connector.insert_ngram(('Der', 'Lünksdenker'), 22) + self.connector.insert_ngram(('Der', 'Lünksabbieger'), 32) + result = self.connector.ngram_like_table(('der', 'lunks')) + assert result == [('Der', 'Lünksabbieger', 32), ( + 'Der', 'Lünksdenker', 22)] + self.connector.execute_sql("DROP TABLE _2_gram;") + + self.connector.normalize = False + self.connector.lowercase = False + + def teardown(self): + self.connector.close_database() + con = psycopg2.connect(database="postgres", user="postgres") + con.set_isolation_level( + psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT) + c = con.cursor() + c.execute("DROP DATABASE test;") + con.close() diff --git a/uberwriter_lib/pressagio/tests/test_predictor.py b/uberwriter_lib/pressagio/tests/test_predictor.py new file mode 100644 index 0000000..d4e3be3 --- /dev/null +++ b/uberwriter_lib/pressagio/tests/test_predictor.py @@ -0,0 +1,143 @@ +# -*- coding: utf-8 -*- +# +# Poio Tools for Linguists +# +# Copyright (C) 2009-2013 Poio Project +# Author: Peter Bouda +# URL: +# For license information, see LICENSE + +from 
__future__ import absolute_import, unicode_literals + + import os + try: + import configparser + except ImportError: + import ConfigParser as configparser + + import pressagio.predictor + import pressagio.tokenizer + import pressagio.dbconnector + import pressagio.context_tracker + import pressagio.callback + + class TestSuggestion(): + + def setup(self): + self.suggestion = pressagio.predictor.Suggestion("Test", 0.3) + + def test_probability(self): + self.suggestion.probability = 0.1 + assert self.suggestion.probability == 0.1 + + + class TestPrediction(): + + def setup(self): + self.prediction = pressagio.predictor.Prediction() + + def test_add_suggestion(self): + self.prediction.add_suggestion(pressagio.predictor.Suggestion( + "Test", 0.3)) + assert self.prediction[0].word == "Test" + assert self.prediction[0].probability == 0.3 + + self.prediction.add_suggestion(pressagio.predictor.Suggestion( + "Test2", 0.2)) + assert self.prediction[0].word == "Test" + assert self.prediction[0].probability == 0.3 + assert self.prediction[1].word == "Test2" + assert self.prediction[1].probability == 0.2 + + self.prediction.add_suggestion(pressagio.predictor.Suggestion( + "Test3", 0.6)) + assert self.prediction[0].word == "Test3" + assert self.prediction[0].probability == 0.6 + assert self.prediction[1].word == "Test" + assert self.prediction[1].probability == 0.3 + assert self.prediction[2].word == "Test2" + assert self.prediction[2].probability == 0.2 + + self.prediction[:] = [] + + def test_suggestion_for_token(self): + self.prediction.add_suggestion(pressagio.predictor.Suggestion( + "Token", 0.8)) + assert self.prediction.suggestion_for_token("Token").probability == 0.8 + self.prediction[:] = [] + + class StringStreamCallback(pressagio.callback.Callback): + + def __init__(self, stream): + pressagio.callback.Callback.__init__(self) + self.stream = stream + + def past_stream(self): + # without this override the context tracker would always see an empty + # past stream, and setting self.callback.stream below would have no effect + return self.stream + + class TestSmoothedNgramPredictor(): + + def setup(self): + self.dbfilename = os.path.abspath(os.path.join( + os.path.dirname( __file__ ), 'test_data', 'test.db')) + self.infile = os.path.abspath(os.path.join(os.path.dirname( __file__ ), + 'test_data', 'der_linksdenker.txt')) + + for ngram_size in range(3): + ngram_map = pressagio.tokenizer.forward_tokenize_file( + self.infile, ngram_size + 1, False) + pressagio.dbconnector.insert_ngram_map_sqlite(ngram_map, ngram_size + 1, + self.dbfilename, False) + + config_file = os.path.abspath(os.path.join(os.path.dirname( __file__ ), + 'test_data', 'profile_smoothedngram.ini')) + config = configparser.ConfigParser() + config.read(config_file) + config.set("Database", "database", self.dbfilename) + + self.predictor_registry = pressagio.predictor.PredictorRegistry(config) + + self.callback = StringStreamCallback("") + context_tracker = pressagio.context_tracker.ContextTracker( + config, self.predictor_registry, self.callback) + + def test_predict(self): + predictor = self.predictor_registry[0] + predictions = predictor.predict(6, None) + assert len(predictions) == 6 + words = [] + for p in predictions: + words.append(p.word) + assert "er" in words + assert "der" in words + assert "die" in words + assert "und" in words + assert "nicht" in words + + self.callback.stream = "d" + predictions = predictor.predict(6, None) + assert len(predictions) == 6 + words = [] + for p in predictions: + words.append(p.word) + assert "der" in words + assert "die" in words + assert "das" in words + assert "da" in words + assert "Der" in words + + self.callback.stream = "de" + predictions = predictor.predict(6, None) + assert len(predictions) == 
6 + words = [] + for p in predictions: + words.append(p.word) + assert "der" in words + assert "Der" in words + assert "dem" in words + assert "den" in words + assert "des" in words + + def teardown(self): + if self.predictor_registry[0].db: + self.predictor_registry[0].db.close_database() + del(self.predictor_registry[0]) + if os.path.isfile(self.dbfilename): + os.remove(self.dbfilename) diff --git a/uberwriter_lib/pressagio/tests/test_tokenizer.py b/uberwriter_lib/pressagio/tests/test_tokenizer.py new file mode 100644 index 0000000..8813056 --- /dev/null +++ b/uberwriter_lib/pressagio/tests/test_tokenizer.py @@ -0,0 +1,91 @@ +# -*- coding: utf-8 -*- +# +# Poio Tools for Linguists +# +# Copyright (C) 2009-2013 Poio Project +# Author: Peter Bouda +# URL: +# For license information, see LICENSE + +from __future__ import absolute_import, unicode_literals + +import os +import codecs + +import pressagio.tokenizer + + +class TestForwardTokenizer(): + + def setup(self): + filename = os.path.abspath(os.path.join(os.path.dirname( __file__ ), + 'test_data', 'der_linksdenker.txt')) + self.tokenizer = pressagio.tokenizer.ForwardTokenizer(filename) + + def test_reset_stream(self): + self.tokenizer.next_token() + assert self.tokenizer.offset != 0 + self.tokenizer.reset_stream() + assert self.tokenizer.offset == 0 + + def test_count_characters(self): + # TODO: Windows tokenization is different, check why + assert self.tokenizer.count_characters() == 7954 + + def test_count_tokens(self): + assert self.tokenizer.count_tokens() == 1235 + + def test_has_more_tokens(self): + assert self.tokenizer.has_more_tokens() == True + + def test_next_token(self): + assert self.tokenizer.next_token() == "Der" + self.tokenizer.reset_stream() + + def test_is_blankspace(self): + assert self.tokenizer.is_blankspace('\n') == True + assert self.tokenizer.is_blankspace('a') == False + + def test_is_separator(self): + assert self.tokenizer.is_separator('"') == True + assert self.tokenizer.is_separator('b') == False + + +class TestReverseTokenizer(): + + def setup(self): + filename = os.path.abspath(os.path.join(os.path.dirname( __file__ ), + 'test_data', 'der_linksdenker.txt')) + self.tokenizer = pressagio.tokenizer.ReverseTokenizer(filename) + + def test_reset_stream(self): + self.tokenizer.next_token() + assert self.tokenizer.offset != self.tokenizer.offend + self.tokenizer.reset_stream() + assert self.tokenizer.offset == self.tokenizer.offend + + def test_count_tokens(self): + assert self.tokenizer.count_tokens() == 1235 + + def test_has_more_tokens(self): + assert self.tokenizer.has_more_tokens() == True + + def test_next_token(self): + assert self.tokenizer.next_token() == "Linksdenker" + self.tokenizer.reset_stream() + + +def test_tokenizers_are_equal(): + filename = os.path.abspath(os.path.join(os.path.dirname( __file__ ), + 'test_data', 'der_linksdenker.txt')) + reverse_tokenizer = pressagio.tokenizer.ReverseTokenizer(filename) + forward_tokenizer = pressagio.tokenizer.ForwardTokenizer(filename) + forward_tokens = [] + reverse_tokens = [] + while forward_tokenizer.has_more_tokens(): + forward_tokens.append(forward_tokenizer.next_token()) + while reverse_tokenizer.has_more_tokens(): + reverse_tokens.append(reverse_tokenizer.next_token()) + diff = set(forward_tokens) ^ set(reverse_tokens) + assert forward_tokens == reverse_tokens[::-1] + assert len(diff) == 0 diff --git a/uberwriter_lib/pressagio/tokenizer.py b/uberwriter_lib/pressagio/tokenizer.py new file mode 100644 index 0000000..b9d32b7 --- /dev/null +++ 
b/uberwriter_lib/pressagio/tokenizer.py @@ -0,0 +1,289 @@ +# -*- coding: utf-8 -*- +# +# Poio Tools for Linguists +# +# Copyright (C) 2009-2013 Poio Project +# Author: Peter Bouda +# URL: +# For license information, see LICENSE + +""" +Several classes to tokenize text. + +""" + +from __future__ import absolute_import, unicode_literals + +import abc +import codecs +import collections + +from . import character + +class Tokenizer(object): + """ + Base class for all tokenizers. + + """ + + __metaclass__ = abc.ABCMeta + + def __init__(self, stream, blankspaces = character.blankspaces, + separators = character.separators): + """ + Constructor of the Tokenizer base class. + + Parameters + ---------- + stream : str or io.IOBase + The stream to tokenize. Can be a filename or any open IO stream. + + blankspaces : str + The characters that represent empty spaces. + + separators : str + The characters that separate token units (e.g. word boundaries). + + """ + self.separators = separators + self.blankspaces = blankspaces + self.lowercase = False + + self.offbeg = 0 + self.offset = None + self.offend = None + + def is_blankspace(self, char): + """ + Test if a character is a blankspace. + + Parameters + ---------- + char : str + The character to test. + + Returns + ------- + ret : bool + True if character is a blankspace, False otherwise. + + """ + if len(char) > 1: + raise TypeError("Expected a char.") + if char in self.blankspaces: + return True + return False + + def is_separator(self, char): + """ + Test if a character is a separator. + + Parameters + ---------- + char : str + The character to test. + + Returns + ------- + ret : bool + True if character is a separator, False otherwise. + + """ + if len(char) > 1: + raise TypeError("Expected a char.") + if char in self.separators: + return True + return False + + @abc.abstractmethod + def count_characters(self): + raise NotImplementedError("Method must be implemented") + + @abc.abstractmethod + def reset_stream(self): + raise NotImplementedError("Method must be implemented") + + @abc.abstractmethod + def count_tokens(self): + raise NotImplementedError("Method must be implemented") + + @abc.abstractmethod + def has_more_tokens(self): + raise NotImplementedError("Method must be implemented") + + @abc.abstractmethod + def next_token(self): + raise NotImplementedError("Method must be implemented") + + @abc.abstractmethod + def progress(self): + raise NotImplementedError("Method must be implemented") + + +class ForwardTokenizer(Tokenizer): + + def __init__(self, stream, blankspaces = character.blankspaces, + separators = character.separators): + Tokenizer.__init__(self, stream, blankspaces, separators) + if not hasattr(stream, 'read'): + stream = codecs.open(stream, "r", "utf-8") + self.text = stream.read() + stream.close() + + + self.offend = self.count_characters() - 1 + self.reset_stream() + + def count_tokens(self): + count = 0 + while(self.has_more_tokens()): + count += 1 + self.next_token() + + self.reset_stream() + + return count + + def count_characters(self): + """ + Counts the number of unicode characters in the IO stream. 
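+ The whole stream is read into memory, so this is the length of the decoded text, not the file's byte count.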
+ + """ + return len(self.text) + + def has_more_tokens(self): + if self.offset < self.offend: + return True + return False + + def next_token(self): + current = self.text[self.offset] + self.offset += 1 + token = "" + + if self.offset <= self.offend: + while (self.is_blankspace(current) or self.is_separator(current)) \ + and self.offset < self.offend: + current = self.text[self.offset] + self.offset += 1 + + while not self.is_blankspace(current) and not self.is_separator( + current) and self.offset <= self.offend: + + if self.lowercase: + current = current.lower() + + token += current + + current = self.text[self.offset] + self.offset += 1 + + if self.offset > self.offend: + token += self.text[-1] + + + return token + + def progress(self): + return float(offset)/offend + + def reset_stream(self): + self.offset = 0 + + +class ReverseTokenizer(Tokenizer): + + def __init__(self, stream, blankspaces = character.blankspaces, + separators = character.separators): + Tokenizer.__init__(self, stream, blankspaces, separators) + if not hasattr(stream, 'read'): + stream = codecs.open(stream, "r", "utf-8") + self.text = stream.read() + stream.close() + + self.offend = self.count_characters() - 1 + self.offset = self.offend + + def count_tokens(self): + curroff = self.offset + self.offset = self.offend + count = 0 + while (self.has_more_tokens()): + self.next_token() + count += 1 + self.offset = curroff + return count + + def count_characters(self): + """ + Counts the number of unicode characters in the IO stream. + + """ + return len(self.text) + + def has_more_tokens(self): + if (self.offbeg <= self.offset): + return True + else: + return False + + def next_token(self): + token = "" + + while (self.offbeg <= self.offset) and len(token) == 0: + current = self.text[self.offset] + + if (self.offset == self.offend) and (self.is_separator(current) \ + or self.is_blankspace(current)): + self.offset -= 1 + return token + + while (self.is_blankspace(current) or self.is_separator(current)) \ + and self.offbeg < self.offset: + self.offset -= 1 + if (self.offbeg <= self.offset): + current = self.text[self.offset] + + while not self.is_blankspace(current) and not self.is_separator( + current) and self.offbeg <= self.offset: + if self.lowercase: + current = current.lower() + token = current + token + self.offset -= 1 + if (self.offbeg <= self.offset): + current = self.text[self.offset] + + return token + + def progress(self): + return float(self.offend - self.offset) / (self.offend - self.offbeg) + + def reset_stream(self): + self.offset = self.offend + + +def forward_tokenize_file(infile, ngram_size, lowercase=False, cutoff=0): + ngram_map = collections.defaultdict(int) + ngram_list = [] + tokenizer = ForwardTokenizer(infile) + tokenizer.lowercase = lowercase + + for i in range(ngram_size - 1): + if not tokenizer.has_more_tokens(): + break + ngram_list.append(tokenizer.next_token()) + + while (tokenizer.has_more_tokens()): + token = tokenizer.next_token() + ngram_list.append(token) + ngram_map[tuple(ngram_list)] += 1 + ngram_list.pop(0) + + ngram_map_tmp = dict() + if cutoff > 0: + for k in ngram_map.keys(): + if ngram_map[k] <= cutoff: + del(ngram_map[k]) + + return ngram_map +