added some of said experimental features

experimental
Wolf Vollprecht 2014-10-02 19:04:22 +02:00
parent 8fdc9f465a
commit 9807741b13
20 changed files with 2729 additions and 0 deletions

1
.gitignore vendored
View File

@ -1,4 +1,5 @@
build/lib.linux-x86_64-2.7
*.pyc
__pycache__/
build/scripts-2.7
build/share

View File

@ -0,0 +1,201 @@
# UberwriterAutoCorrect
# The Uberwriter Auto Correct is an auto correction
# mechanism to prevent stupid typos
# import presage
from gi.repository import Gtk, Gdk
import uberwriter_lib.pressagio as pressagio
import enchant
# d = enchant.Dict("de_DE")
import os
import re
import uberwriter_lib.pressagio.predictor
import uberwriter_lib.pressagio.tokenizer
import uberwriter_lib.pressagio.dbconnector
import uberwriter_lib.pressagio.context_tracker
import uberwriter_lib.pressagio.callback
import xml.etree.ElementTree as ET
import pickle
from Levenshtein import distance
import configparser
from uberwriter_lib.helpers import get_media_path
# Define and create PresageCallback object
class PressagioCallback(pressagio.callback.Callback):
def __init__(self, buffer):
super().__init__()
self.buffer = buffer
def past_stream(self):
return self.buffer
def future_stream(self):
return ''
class UberwriterAutoCorrect:
def show_bubble(self, iter, suggestion):
self.suggestion = suggestion
if self.bubble:
self.bubble_label.set_text(suggestion)
else:
pos = self.TextView.get_iter_location(iter)
pos_adjusted = self.TextView.buffer_to_window_coords(Gtk.TextWindowType.TEXT, pos.x, pos.y + pos.height)
self.bubble = Gtk.Grid.new()
self.bubble.set_name("AutoCorrect")
self.TextView.add_child_in_window(self.bubble, Gtk.TextWindowType.TEXT, pos_adjusted[0], pos_adjusted[1])
self.bubble_label = Gtk.Label.new(suggestion)
self.bubble.attach(self.bubble_label, 0, 0, 1, 1)
close = Gtk.Image.new_from_icon_name('dialog-close', Gtk.IconSize.SMALL_TOOLBAR)
self.bubble.attach(close, 1, 0, 1, 1)
self.bubble.show_all()
def suggest(self, stump, context):
if self.enchant_dict.check(stump):
self.destroy_bubble()
return
self.callback.buffer = ' '.join(context) + ' ' + stump
self.callback.buffer = self.callback.buffer.lstrip().rstrip()
predictions = []
if self.use_pressagio:
predictions = self.prsgio.predict(6, None)
prediction = None
if not len(predictions):
if self.enchant_dict.check(stump):
self.destroy_bubble()
return
predictions = self.enchant_dict.suggest(stump)
suggestions_map = []
for suggestion in predictions:
if suggestion in self.frequency_dict:
suggestions_map.append({'suggestion': suggestion, 'freq': self.frequency_dict[suggestion]})
else:
suggestions_map.append({'suggestion': suggestion, 'freq': 0})
suggestions_map.sort(key=lambda x: x['freq'], reverse=True)
prediction = suggestions_map[0]['suggestion']
else:
prediction = predictions[0].word
anchor_iter = self.buffer.get_iter_at_mark(self.buffer.get_insert())
anchor_iter.backward_visible_word_start()
if len(stump) >= 1:
self.show_bubble(anchor_iter, prediction)
def destroy_bubble(self, *args):
if not self.bubble:
return
self.bubble.destroy()
self.bubble = None
self.suggestion = ''
def get_frequency_dict(self, language):
self.frequency_dict = {}
pp_pickled = get_media_path("frequency_dict_" + self.language + ".pickle")
if pp_pickled and os.path.isfile(pp_pickled):
f = open(pp_pickled, 'rb')
self.frequency_dict = pickle.load(f)
f.close()
else:
pp = get_media_path('wordlists/en_us_wordlist.xml')
frequencies = ET.parse(pp)
root = frequencies.getroot()
for child in root:
self.frequency_dict[child.text] = int(child.attrib['f'])
f = open('pickled_dict', 'wb+')
pickle.dump(self.frequency_dict, f)
f.close()
def accept_suggestion(self, append=""):
print("called")
curr_iter = self.buffer.get_iter_at_mark(self.buffer.get_insert())
start_iter = curr_iter.copy()
start_iter.backward_visible_word_start()
self.buffer.delete(start_iter, curr_iter)
self.buffer.insert_at_cursor(self.suggestion + append)
self.destroy_bubble()
def key_pressed(self, widget, event):
if not self.bubble:
return False
if event.keyval in [Gdk.KEY_Escape, Gdk.KEY_BackSpace]:
self.destroy_bubble()
return False
def text_insert(self, buffer, location,
text, len, data=None):
# check if at end of a word
# if yes, check if suggestion available
# then display suggestion
if self.suggestion and text in [' ', '\t', '\n', '.', '?', '!', ',', ';', '\'', '"', ')', ':']:
self.accept_suggestion(append=text)
location.assign(self.buffer.get_iter_at_mark(self.buffer.get_insert()))
elif location.ends_word():
iter_start = location.copy()
iter_start.backward_visible_word_starts(3)
text = buffer.get_text(iter_start, location, False)
words = text.split()
self.suggest(words[-1], words[0:-1])
def disable(self):
self.disabled = True
def enable(self):
self.disabled = False
def set_language(self, language):
print("Language changed to: %s" % language)
# handle 2 char cases e.g. "en"
if len(language) == 2 and language == "en":
language = "en_US"
if self.language == language:
return
else:
self.language = language
print("Language changing")
config_file = get_media_path("pressagio_config.ini")
pres_config = configparser.ConfigParser()
pres_config.read(config_file)
pres_config.set("Database", "database", get_media_path("corpora/" + self.language + ".sqlite"))
self.context_tracker = pressagio.context_tracker.ContextTracker(
pres_config, self.predictor_registry, self.callback)
self.prsgio = self.predictor_registry[0]
self.enchant_dict = enchant.Dict(self.language)
def __init__(self, textview, textbuffer):
self.TextView = textview
self.buffer = textbuffer
self.suggestion = ""
self.bubble = self.bubble_label = None
self.buffer.connect_after('insert-text', self.text_insert)
self.TextView.connect('key-press-event', self.key_pressed)
self.language = "en_US"
self.frequency_dict = {}
self.get_frequency_dict(self.language)
self.enchant_dict = enchant.Dict(self.language)
self.use_pressagio = False
config_file = get_media_path("pressagio_config.ini")
pres_config = configparser.ConfigParser()
pres_config.read(config_file)
pres_config.set("Database", "database", get_media_path("corpora/" + self.language + ".sqlite"))
self.callback = PressagioCallback("")
self.predictor_registry = pressagio.predictor.PredictorRegistry(pres_config)
self.context_tracker = pressagio.context_tracker.ContextTracker(
pres_config, self.predictor_registry, self.callback)
self.prsgio = self.predictor_registry[0]
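
For reference, a minimal sketch of how this corrector could be wired into a GTK editor. The import path is hypothetical (only the class name is given above), and the media files the constructor loads (frequency dictionary, pressagio_config.ini, corpora) have to be present for it to work:

from gi.repository import Gtk
# hypothetical import path; only the class name appears in this commit
from uberwriter_lib.AutoCorrect import UberwriterAutoCorrect

window = Gtk.Window(title="AutoCorrect demo")
textview = Gtk.TextView()
window.add(textview)

# The corrector hooks 'insert-text' on the buffer and 'key-press-event' on the view.
autocorrect = UberwriterAutoCorrect(textview, textview.get_buffer())

window.connect("destroy", Gtk.main_quit)
window.show_all()
Gtk.main()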

View File

@ -0,0 +1 @@
0.1.3

View File

@ -0,0 +1,34 @@
# -*- coding: utf-8 -*-
#
# Poio Tools for Linguists
#
# Copyright (C) 2009-2013 Poio Project
# Author: Peter Bouda <pbouda@cidles.eu>
# URL: <http://media.cidles.eu/poio/>
# For license information, see LICENSE
from . import predictor
from . import context_tracker
class Pressagio:
def __init__(self, callback, config, dbconnection = None):
self.config = config
self.callback = callback
self.predictor_registry = predictor.PredictorRegistry(
self.config, dbconnection)
self.context_tracker = context_tracker.ContextTracker(
self.config, self.predictor_registry, callback)
self.predictor_activator = predictor.PredictorActivator(
self.config, self.predictor_registry, self.context_tracker)
self.predictor_activator.combination_policy = "meritocracy"
def predict(self):
multiplier = 1
predictions = self.predictor_activator.predict(multiplier)
return [p.word for p in predictions]
def close_database(self):
self.predictor_registry.close_database()
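
A hedged usage sketch of this facade, assuming the package is importable as pressagio (as the tests later in this commit do) and that a profile shaped like the template further down is available; file names and paths are placeholders:

import configparser
import pressagio
import pressagio.callback

class DemoCallback(pressagio.callback.Callback):
    def __init__(self, text):
        super().__init__()
        self.stream = text  # past_stream() returns this buffer

config = configparser.ConfigParser()
config.read("profile.ini")  # placeholder; shaped like the template in this commit
config.set("Database", "database", "corpora/en_US.sqlite")  # placeholder path

prsgio = pressagio.Pressagio(DemoCallback("der links"), config)
print(prsgio.predict())  # list of suggested words for the current prefix
prsgio.close_database()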

View File

@ -0,0 +1,37 @@
# -*- coding: utf-8 -*-
#
# Poio Tools for Linguists
#
# Copyright (C) 2009-2013 Poio Project
# Author: Peter Bouda <pbouda@cidles.eu>
# URL: <http://media.cidles.eu/poio/>
# For license information, see LICENSE
"""
Base class for callbacks.
"""
from __future__ import absolute_import, unicode_literals
class Callback(object):
"""
Base class for callbacks.
"""
def __init__(self):
self.stream = ""
self.empty = ""
def past_stream(self):
return self.stream
def future_stream(self):
return self.empty
def update(self, character):
if character == "\b" and len(self.stream) > 0:
self.stream = self.stream[:-1]
else:
self.stream += character

View File

@ -0,0 +1,34 @@
# -*- coding: utf-8 -*-
#
# Poio Tools for Linguists
#
# Copyright (C) 2009-2013 Poio Project
# Author: Peter Bouda <pbouda@cidles.eu>
# URL: <http://media.cidles.eu/poio/>
# For license information, see LICENSE
from __future__ import absolute_import, unicode_literals
import unicodedata
blankspaces = " \f\n\r\t\v…"
separators = "`~!@#$%^&*()_-+=\\|]}[{'\";:/?.>,<†„“।॥ו´י0123456789"
def first_word_character(string):
for i, ch in enumerate(string):
if is_word_character(ch):
return i
return -1
def last_word_character(string):
result = first_word_character(string[::-1])
if result == -1:
return -1
return len(string) - result - 1
def is_word_character(char):
# check for letter category
if unicodedata.category(char)[0] == "L":
return True
return False

View File

@ -0,0 +1,64 @@
# -*- coding: utf-8 -*-
#
# Poio Tools for Linguists
#
# Copyright (C) 2009-2013 Poio Project
# Author: Peter Bouda <pbouda@cidles.eu>
# URL: <http://media.cidles.eu/poio/>
# For license information, see LICENSE
"""
Combiner classes to merge results from several predictors.
"""
from __future__ import absolute_import, unicode_literals
import abc
from . import predictor
class Combiner(object):
"""
Base class for all combiners
"""
__metaclass__ = abc.ABCMeta
def __init__(self):
pass
def filter(self, prediction):
seen_tokens = set()
result = predictor.Prediction()
for i, suggestion in enumerate(prediction):
token = suggestion.word
if token not in seen_tokens:
for j in range(i+1, len(prediction)):
if token == prediction[j].word:
# TODO: interpolate here?
suggestion.probability += prediction[j].probability
if suggestion.probability > predictor.MAX_PROBABILITY:
suggestion.probability = predictor.MAX_PROBABILITY
seen_tokens.add(token)
result.add_suggestion(suggestion)
return result
@abc.abstractmethod
def combine(self):
raise NotImplementedError("Method must be implemented")
class MeritocracyCombiner(Combiner):
def __init__(self):
pass
def combine(self, predictions):
result = predictor.Prediction()
for prediction in predictions:
for suggestion in prediction:
result.add_suggestion(suggestion)
return(self.filter(result))
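
A small illustration of the meritocracy policy, assuming the package layout used by the tests in this commit (pressagio.predictor, pressagio.combiner):

from pressagio.predictor import Prediction, Suggestion
from pressagio.combiner import MeritocracyCombiner

first, second = Prediction(), Prediction()
first.add_suggestion(Suggestion("linksdenker", 0.3))
second.add_suggestion(Suggestion("linksdenker", 0.1))
second.add_suggestion(Suggestion("linksabbieger", 0.2))

combined = MeritocracyCombiner().combine([first, second])
# Duplicate words are merged and their probabilities summed:
# [Word: linksdenker - Probability: 0.4, Word: linksabbieger - Probability: 0.2]
print(combined)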

View File

@ -0,0 +1,177 @@
# -*- coding: utf-8 -*-
#
# Poio Tools for Linguists
#
# Copyright (C) 2009-2013 Poio Project
# Author: Peter Bouda <pbouda@cidles.eu>
# URL: <http://media.cidles.eu/poio/>
# For license information, see LICENSE
"""
Class for context tracker.
"""
from __future__ import absolute_import, unicode_literals
import copy
import io
from . import character
from . import observer
from . import tokenizer
DEFAULT_SLIDING_WINDOW_SIZE = 80
class InvalidCallbackException(Exception): pass
class ContextChangeDetector(object):
def __init__(self, lowercase):
self.lowercase = lowercase
self.sliding_windows_size = DEFAULT_SLIDING_WINDOW_SIZE
self.sliding_window = ""
def update_sliding_window(self, string):
if len(string) <= self.sliding_windows_size:
self.sliding_window = string
else:
self.sliding_window = string[:-self.sliding_windows_size]
def context_change(self, past_stream):
# rename for clarity
prev_context = self.sliding_window
curr_context = past_stream
if len(prev_context) == 0:
if len(curr_context) == 0:
return False
else:
return True
ctx_idx = curr_context.rfind(prev_context)
if ctx_idx == -1:
return True
remainder = curr_context[ctx_idx + len(prev_context):]
idx = character.last_word_character(remainder)
if idx == -1:
if len(remainder) == 0:
return False
last_char = curr_context[ctx_idx + len(prev_context) - 1]
if character.is_word_character(last_char):
return False
else:
return True
if idx == len(remainder) - 1:
return False
return True
def change(self, past_stream):
# rename for clarity
prev_context = self.sliding_window
curr_context = past_stream
if len(prev_context) == 0:
return past_stream
ctx_idx = curr_context.rfind(prev_context)
if ctx_idx == -1:
return past_stream
result = curr_context[ctx_idx + len(prev_context):]
if (self.context_change(past_stream)):
sliding_window_stream = self.sliding_window
r_tok = tokenizer.ReverseTokenizer(sliding_window_stream)
r_tok.lowercase = self.lowercase
first_token = r_tok.next_token()
if not len(first_token) == 0:
result = first_token + result
return result
class ContextTracker(object): #observer.Observer
"""
Tracks the current context.
"""
def __init__(self, config, predictor_registry, callback):
#self.dispatcher = observer.Dispatcher(self)
self.config = config
self.lowercase = self.config.getboolean("ContextTracker", "lowercase_mode")
self.registry = predictor_registry
if callback:
self.callback = callback
else:
raise InvalidCallbackException
self.context_change_detector = ContextChangeDetector(self.lowercase)
self.registry.context_tracker = self
self.sliding_windows_size = DEFAULT_SLIDING_WINDOW_SIZE
def context_change(self):
return self.context_change_detector.context_change(self.past_stream())
def update_context(self):
change = self.context_change_detector.change(self.past_stream())
tok = tokenizer.ForwardTokenizer(change)
tok.lowercase = self.lowercase
change_tokens = []
while(tok.has_more_tokens()):
token = tok.next_token()
change_tokens.append(token)
if len(change_tokens) != 0:
# remove prefix (partially entered token or empty token)
change_tokens.pop()
for predictor in self.registry:
predictor.learn(change_tokens)
self.context_change_detector.update_sliding_window(self.past_stream())
def prefix(self):
return self.token(0)
def token(self, index):
past_string_stream = self.past_stream()
string_io = io.StringIO(past_string_stream)
tok = tokenizer.ReverseTokenizer(string_io)
tok.lowercase = self.lowercase
i = 0
while tok.has_more_tokens() and i <= index:
token = tok.next_token()
i += 1
if i <= index:
token = ""
return token
def extra_token_to_learn(self, index, change):
return self.token(index + len(change))
def future_stream(self):
return self.callback.future_stream()
def past_stream(self):
return self.callback.past_stream()
def is_completion_valid(self, completion):
prefix = self.prefix().lower()
if prefix in completion:
return True
return False
def __repr__(self):
return self.callback.past_stream() + "<|>" + self.callback.future_stream() \
+ "\n"
# def update(self, observable):
# self.dispatcher.dispatch(observable)
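
A rough sketch of the change detector above in isolation (the module path pressagio.context_tracker is an assumption):

from pressagio.context_tracker import ContextChangeDetector

detector = ContextChangeDetector(lowercase=True)
detector.update_sliding_window("the quick brown ")

# Still typing inside the next word: no context change since the snapshot.
print(detector.context_change("the quick brown f"))    # False
# A word boundary was crossed since the snapshot: context changed.
print(detector.context_change("the quick brown fox ")) # True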

View File

@ -0,0 +1,745 @@
# -*- coding: utf-8 -*-
#
# Poio Tools for Linguists
#
# Copyright (C) 2001-2013 Poio Project
# Author: Peter Bouda <pbouda@cidles.eu>
# URL: <http://www.cidles.eu/ltll/poio>
# For license information, see LICENSE
"""
Classes to connect to databases.
"""
from __future__ import absolute_import, unicode_literals
import abc
import sqlite3
import time
import re
import regex
try:
import psycopg2
psycopg2.extensions.register_type(psycopg2.extensions.UNICODE)
psycopg2.extensions.register_type(psycopg2.extensions.UNICODEARRAY)
except ImportError:
pass
re_escape_singlequote = re.compile("'")
def _sqlite3_regex(expr, item):
return regex.search(expr, item) is not None
class DatabaseConnector(object):
"""
Base class for all database connectors.
"""
__metaclass__ = abc.ABCMeta
def __init__(self, dbname, cardinality = 1):
"""
Constructor of the base class DatabaseConnector.
Parameters
----------
dbname : str
path to the database file or database name
cardinality : int
default cardinality for n-grams
"""
print("asjdas jdlkasj ljsa kdj lsakdj lk")
self.cardinality = cardinality
self.dbname = dbname
self.lowercase = False
self.normalize = False
def create_ngram_table(self, cardinality):
"""
Creates a table for n-grams of a given cardinality. The table name is
constructed from this parameter, for example for cardinality `2` there
will be a table `_2_gram` created.
Parameters
----------
cardinality : int
The cardinality to create a table for.
"""
query = "CREATE TABLE IF NOT EXISTS _{0}_gram (".format(cardinality)
unique = ""
for i in reversed(range(cardinality)):
if i != 0:
unique += "word_{0}, ".format(i)
query += "word_{0} TEXT, ".format(i)
else:
unique += "word"
query += "word TEXT, count INTEGER, UNIQUE({0}) );".format(
unique)
self.execute_sql(query)
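# For example, create_ngram_table(2) issues roughly this statement:
#   CREATE TABLE IF NOT EXISTS _2_gram (word_1 TEXT, word TEXT, count INTEGER, UNIQUE(word_1, word) );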
def delete_ngram_table(self, cardinality):
"""
Deletes the table for n-grams of a given cardinality. The table name is
constructed from this parameter, for example for cardinality `2` there
will be a table `_2_gram` deleted.
Parameters
----------
cardinality : int
The cardinality of the table to delete.
"""
query = "DROP TABLE IF EXISTS _{0}_gram;".format(cardinality)
self.execute_sql(query)
def create_index(self, cardinality):
"""
Create an index for the table with the given cardinality.
Parameters
----------
cardinality : int
The cardinality to create an index for.
"""
for i in reversed(range(cardinality)):
if i != 0:
query = "CREATE INDEX idx_{0}_gram_{1} ON _{0}_gram(word_{1});".format(cardinality, i)
self.execute_sql(query)
def delete_index(self, cardinality):
"""
Delete index for the table with the given cardinality.
Parameters
----------
cardinality : int
The cardinality of the index to delete.
"""
for i in reversed(range(cardinality)):
if i != 0:
query = "DROP INDEX IF EXISTS idx_{0}_gram_{1};".format(
cardinality, i)
self.execute_sql(query)
def create_unigram_table(self):
"""
Creates a table for n-grams of cardinality 1.
"""
self.create_ngram_table(1)
def create_bigram_table(self):
"""
Creates a table for n-grams of cardinality 2.
"""
self.create_ngram_table(2)
def create_trigram_table(self):
"""
Creates a table for n-grams of cardinality 3.
"""
self.create_ngram_table(3)
def ngrams(self, with_counts=False):
"""
Returns all ngrams that are in the table.
Parameters
----------
None
Returns
-------
ngrams : generator
A generator for ngram tuples.
"""
query = "SELECT "
for i in reversed(range(self.cardinality)):
if i != 0:
query += "word_{0}, ".format(i)
elif i == 0:
query += "word"
if with_counts:
query += ", count"
query += " FROM _{0}_gram;".format(self.cardinality)
print(query)
result = self.execute_sql(query)
for row in result:
yield tuple(row)
def unigram_counts_sum(self):
query = "SELECT SUM(count) from _1_gram;"
result = self.execute_sql(query)
print(result, query)
return self._extract_first_integer(result)
def ngram_count(self, ngram):
"""
Gets the count for a given ngram from the database.
Parameters
----------
ngram : iterable of str
A list, set or tuple of strings.
Returns
-------
count : int
The count of the ngram.
"""
query = "SELECT count FROM _{0}_gram".format(len(ngram))
query += self._build_where_clause(ngram)
query += ";"
result = self.execute_sql(query)
return self._extract_first_integer(result)
def ngram_like_table(self, ngram, limit = -1):
print("NGRAM LIKE TABLE!\n\n\n")
query = "SELECT {0} FROM _{1}_gram {2} ORDER BY count DESC".format(
self._build_select_like_clause(len(ngram)), len(ngram),
self._build_where_like_clause(ngram))
print(query)
if limit < 0:
query += ";"
else:
query += " LIMIT {0};".format(limit)
return self.execute_sql(query)
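# For example, ngram_like_table(('der', 'links')) builds roughly this query:
#   SELECT word_1, word, count FROM _2_gram WHERE word_1 = 'der'
#   AND (word regexp '(?:links){e<=0}') ORDER BY count DESC;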
def ngram_like_table_filtered(self, ngram, filter, limit = -1):
pass
def increment_ngram_count(self, ngram):
pass
def insert_ngram(self, ngram, count):
"""
Inserts a given n-gram with count into the database.
Parameters
----------
ngram : iterable of str
A list, set or tuple of strings.
count : int
The count for the given n-gram.
"""
query = "INSERT INTO _{0}_gram {1};".format(len(ngram),
self._build_values_clause(ngram, count))
self.execute_sql(query)
def update_ngram(self, ngram, count):
"""
Updates a given ngram in the database. The ngram has to be in the
database, otherwise this method will stop with an error.
Parameters
----------
ngram : iterable of str
A list, set or tuple of strings.
count : int
The count for the given n-gram.
"""
query = "UPDATE _{0}_gram SET count = {1}".format(len(ngram), count)
query += self._build_where_clause(ngram)
query += ";"
self.execute_sql(query)
def remove_ngram(self, ngram):
"""
Removes a given ngram from the database. The ngram has to be in the
database, otherwise this method will stop with an error.
Parameters
----------
ngram : iterable of str
A list, set or tuple of strings.
"""
query = "DELETE FROM _{0}_gram".format(len(ngram))
query += self._build_where_clause(ngram)
query += ";"
self.execute_sql(query)
def open_database(self):
raise NotImplementedError("Method must be implemented")
def close_database(self):
raise NotImplementedError("Method must be implemented")
def execute_sql(self):
raise NotImplementedError("Method must be implemented")
############################################### Private methods
def _build_values_clause(self, ngram, count):
ngram_escaped = []
for n in ngram:
ngram_escaped.append(re_escape_singlequote.sub("''", n))
values_clause = "VALUES('"
values_clause += "', '".join(ngram_escaped)
values_clause += "', {0})".format(count)
return values_clause
def _build_where_clause(self, ngram):
where_clause = " WHERE"
for i in range(len(ngram)):
n = re_escape_singlequote.sub("''", ngram[i])
if i < (len(ngram) - 1):
where_clause += " word_{0} = '{1}' AND".format(
len(ngram) - i - 1, n)
else:
pattern = '(?:^%s){e<=%d}' % (n, 2)
where_clause += " word = '{0}'".format(n)
print(where_clause)
return where_clause
def _build_select_like_clause(self, cardinality):
result = ""
for i in reversed(range(cardinality)):
if i != 0:
result += "word_{0}, ". format(i)
else:
result += "word, count"
return result
def _build_where_like_clause(self, ngram):
where_clause = " WHERE"
for i in range(len(ngram)):
if i < (len(ngram) - 1):
where_clause += " word_{0} = '{1}' AND".format(
len(ngram) - i - 1, ngram[i])
else:
pattern = '(?:%s){e<=%d}' % (ngram[-1], 0)
where_clause += " (word regexp '%s')" % pattern
return where_clause
def _extract_first_integer(self, table):
count = 0
if len(table) > 0:
if len(table[0]) > 0:
count = int(table[0][0])
if not count > 0:
count = 0
return count
class SqliteDatabaseConnector(DatabaseConnector):
"""
Database connector for sqlite databases.
"""
def __init__(self, dbname, cardinality = 1):
"""
Constructor for the sqlite database connector.
Parameters
----------
dbname : str
path to the database file
cardinality : int
default cardinality for n-grams
"""
DatabaseConnector.__init__(self, dbname, cardinality)
self.con = None
self.open_database()
def commit(self):
"""
Sends a commit to the database.
"""
self.con.commit()
def open_database(self):
"""
Opens the sqlite database.
"""
self.con = sqlite3.connect(self.dbname)
self.con.create_function("regexp", 2, _sqlite3_regex)
def close_database(self):
"""
Closes the sqlite database.
"""
if self.con:
self.con.close()
def execute_sql(self, query):
"""
Executes a given query string on an open sqlite database.
"""
c = self.con.cursor()
c.execute(query)
result = c.fetchall()
return result
class PostgresDatabaseConnector(DatabaseConnector):
"""
Database connector for postgres databases.
"""
def __init__(self, dbname, cardinality = 1, host = "localhost", port = 5432,
user = "postgres", password = None, connection = None):
"""
Constructor for the postgres database connector.
Parameters
----------
dbname : str
the database name
cardinality : int
default cardinality for n-grams
host : str
hostname of the postgres database
port : int
port number of the postgres database
user : str
user name for the postgres database
password: str
user password for the postgres database
connection : connection
an open database connection
"""
DatabaseConnector.__init__(self, dbname, cardinality)
self.con = connection
self.host = host
self.port = port
self.user = user
self.password = password
def create_database(self):
"""
Creates an empty database if it does not exist.
"""
if not self._database_exists():
con = psycopg2.connect(host=self.host, database="postgres",
user=self.user, password=self.password, port=self.port)
con.set_isolation_level(
psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
query = "CREATE DATABASE {0};".format(self.dbname)
c = con.cursor()
c.execute(query)
con.close()
if self.normalize:
self.open_database()
query = "CREATE EXTENSION IF NOT EXISTS \"plperlu\";"
self.execute_sql(query)
# query = """CREATE OR REPLACE FUNCTION normalize(str text)
#RETURNS text
#AS $$
#import unicodedata
#return ''.join(c for c in unicodedata.normalize('NFKD', str)
#if unicodedata.category(c) != 'Mn')
#$$ LANGUAGE plpython3u IMMUTABLE;"""
# query = """CREATE OR REPLACE FUNCTION normalize(mystr text)
# RETURNS text
# AS $$
# from unidecode import unidecode
# return unidecode(mystr.decode("utf-8"))
# $$ LANGUAGE plpythonu IMMUTABLE;"""
query = """CREATE OR REPLACE FUNCTION normalize(text)
RETURNS text
AS $$
use Text::Unidecode;
return unidecode(shift);
$$ LANGUAGE plperlu IMMUTABLE;"""
self.execute_sql(query)
self.commit()
self.close_database()
def reset_database(self):
"""
Re-create an empty database.
"""
if self._database_exists():
con = psycopg2.connect(host=self.host, database="postgres",
user=self.user, password=self.password, port=self.port)
con.set_isolation_level(
psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
query = "DROP DATABASE {0};".format(self.dbname)
c = con.cursor()
c.execute(query)
con.close()
self.create_database()
def create_index(self, cardinality):
"""
Create an index for the table with the given cardinality.
Parameters
----------
cardinality : int
The cardinality to create an index for.
"""
DatabaseConnector.create_index(self, cardinality)
query = "CREATE INDEX idx_{0}_gram_varchar ON _{0}_gram(word varchar_pattern_ops);".format(cardinality)
self.execute_sql(query)
if self.lowercase:
for i in reversed(range(cardinality)):
if i != 0:
query = "CREATE INDEX idx_{0}_gram_{1}_lower ON _{0}_gram(LOWER(word_{1}));".format(cardinality, i)
self.execute_sql(query)
if self.normalize:
query = "CREATE INDEX idx_{0}_gram_lower_normalized_varchar ON _{0}_gram(NORMALIZE(LOWER(word)) varchar_pattern_ops);".format(cardinality)
self.execute_sql(query)
else:
query = "CREATE INDEX idx_{0}_gram_lower_varchar ON _{0}_gram(LOWER(word) varchar_pattern_ops);".format(cardinality)
self.execute_sql(query)
elif self.normalize:
query = "CREATE INDEX idx_{0}_gram_normalized_varchar ON _{0}_gram(NORMALIZE(word) varchar_pattern_ops);".format(cardinality)
self.execute_sql(query)
def delete_index(self, cardinality):
"""
Delete index for the table with the given cardinality.
Parameters
----------
cardinality : int
The cardinality of the index to delete.
"""
DatabaseConnector.delete_index(self, cardinality)
query = "DROP INDEX IF EXISTS idx_{0}_gram_varchar;".format(cardinality)
self.execute_sql(query)
query = "DROP INDEX IF EXISTS idx_{0}_gram_normalized_varchar;".format(
cardinality)
self.execute_sql(query)
query = "DROP INDEX IF EXISTS idx_{0}_gram_lower_varchar;".format(
cardinality)
self.execute_sql(query)
query = "DROP INDEX IF EXISTS idx_{0}_gram_lower_normalized_varchar;".\
format(cardinality)
self.execute_sql(query)
for i in reversed(range(cardinality)):
if i != 0:
query = "DROP INDEX IF EXISTS idx_{0}_gram_{1}_lower;".format(
cardinality, i)
self.execute_sql(query)
def commit(self):
"""
Sends a commit to the database.
"""
self.con.commit()
def open_database(self):
"""
Opens the postgres database.
"""
if not self.con:
try:
self.con = psycopg2.connect(host=self.host,
database=self.dbname, user=self.user,
password=self.password, port=self.port)
except psycopg2.Error as e:
print("Error while opening database:")
print(e.pgerror)
def close_database(self):
"""
Closes the postgres database.
"""
if self.con:
self.con.close()
self.con = None
def execute_sql(self, query):
"""
Executes a given query string on an open postgres database.
"""
c = self.con.cursor()
c.execute(query)
result = []
if c.rowcount > 0:
try:
result = c.fetchall()
except psycopg2.ProgrammingError:
pass
return result
############################################### Private methods
def _database_exists(self):
"""
Check if the database exists.
"""
con = psycopg2.connect(host=self.host, database="postgres",
user=self.user, password=self.password, port=self.port)
query_check = "select datname from pg_catalog.pg_database"
query_check += " where datname = '{0}';".format(self.dbname)
c = con.cursor()
c.execute(query_check)
result = c.fetchall()
if len(result) > 0:
return True
return False
def _build_where_like_clause(self, ngram):
where_clause = " WHERE"
for i in range(len(ngram)):
if i < (len(ngram) - 1):
if self.lowercase:
where_clause += " LOWER(word_{0}) = LOWER('{1}') AND".format(
len(ngram) - i - 1, ngram[i])
else:
where_clause += " word_{0} = '{1}' AND".format(
len(ngram) - i - 1, ngram[i])
else:
if ngram[-1] != "":
if self.lowercase:
if self.normalize:
where_clause += " NORMALIZE(LOWER(word)) LIKE NORMALIZE(LOWER('{0}%'))".format(ngram[-1])
else:
where_clause += " LOWER(word) LIKE LOWER('{0}%')".format(ngram[-1])
elif self.normalize:
where_clause += " NORMALIZE(word) LIKE NORMALIZE('{0}%')".format(ngram[-1])
else:
where_clause += " word LIKE '{0}%'".format(ngram[-1])
else:
# remove the " AND"
where_clause = where_clause[:-4]
return where_clause
#################################################### Functions
def insert_ngram_map_sqlite(ngram_map, ngram_size, outfile, append=False,
create_index=False):
sql = SqliteDatabaseConnector(outfile, ngram_size)
sql.create_ngram_table(ngram_size)
for ngram, count in ngram_map.items():
if append:
old_count = sql.ngram_count(ngram)
if old_count > 0:
sql.update_ngram(ngram, old_count + count)
else:
sql.insert_ngram(ngram, count)
else:
sql.insert_ngram(ngram, count)
sql.commit()
if create_index and not append:
sql.create_index(ngram_size)
sql.close_database()
def insert_ngram_map_postgres(ngram_map, ngram_size, dbname, append=False,
create_index=False, host = "localhost", port = 5432, user = "postgres",
password = None, lowercase = False, normalize = False):
sql = PostgresDatabaseConnector(dbname, ngram_size, host, port, user,
password)
sql.lowercase = lowercase
sql.normalize = normalize
sql.create_database()
sql.open_database()
if not append:
sql.delete_index(ngram_size)
sql.delete_ngram_table(ngram_size)
sql.create_ngram_table(ngram_size)
for ngram, count in ngram_map.items():
if append:
old_count = sql.ngram_count(ngram)
if old_count > 0:
sql.update_ngram(ngram, old_count + count)
else:
sql.insert_ngram(ngram, count)
else:
sql.insert_ngram(ngram, count)
sql.commit()
if create_index and not append:
sql.create_index(ngram_size)
sql.commit()
sql.close_database()
def _filter_ngrams(sql, dictionary):
for ngram in sql.ngrams():
delete_ngram = False
for word in ngram:
if not word in dictionary:
delete_ngram = True
if delete_ngram:
sql.remove_ngram(ngram)
def filter_ngrams_sqlite(dictionary, ngram_size, outfile):
sql = SqliteDatabaseConnector(outfile, ngram_size)
_filter_ngrams(sql, dictionary)
sql.commit()
sql.close_database()
def filter_ngrams_postgres(dictionary, ngram_size, dbname, host = "localhost",
port = 5432, user = "postgres", password = None):
sql = PostgresDatabaseConnector(dbname, ngram_size, host, port, user,
password)
sql.open_database()
_filter_ngrams(sql, dictionary)
sql.commit()
sql.close_database()
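
A short sketch of building a tiny unigram database with the helper above (the module path is assumed, the file name is a placeholder):

from pressagio.dbconnector import SqliteDatabaseConnector, insert_ngram_map_sqlite

ngram_map = {("der",): 10, ("linksdenker",): 3}
insert_ngram_map_sqlite(ngram_map, 1, "demo.db", create_index=True)

sql = SqliteDatabaseConnector("demo.db", 1)
print(sql.ngram_count(("der",)))   # 10
print(sql.unigram_counts_sum())    # 13
sql.close_database()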

View File

@ -0,0 +1,71 @@
# -*- coding: utf-8 -*-
#
# Poio Tools for Linguists
#
# Copyright (C) 2009-2013 Poio Project
# Author: Peter Bouda <pbouda@cidles.eu>
# URL: <http://media.cidles.eu/poio/>
# For license information, see LICENSE
from __future__ import absolute_import, unicode_literals
import abc
class Observer(object):
"""
Base class for classes that want to observe other classes, e.g. the
PredictorActivator.
"""
__metaclass__ = abc.ABCMeta
@abc.abstractmethod
def update(self, observable):
raise NotImplementedError("Method must be implemented")
class Observable(object):
"""
Base class for everything that needs observation, e.g. the predictors.
"""
def __init__(self):
self._observers = []
def attach(self, observer):
if not observer in self._observers:
self._observers.append(observer)
def detach(self, observer):
try:
self._observers.remove(observer)
except ValueError:
pass
def notify(self, modifier=None):
for observer in self._observers:
if modifier != observer:
observer.update(self)
class Dispatcher(object):
"""
Dispatches observable notifications.
"""
def __init__(self, obj):
self.observables = []
self.dispatch_dict = {}
self.obj = obj
def map(self, observable, func):
observable.attach(self.obj)
self.observables.append(observable)
self.dispatch_dict[observable] = func
self.dispatch(observable)
def dispatch(self, observable):
handler_func = self.dispatch_dict[observable]
handler_func(observable)
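
The observer wiring is commented out elsewhere in this commit, so purely as an illustration, a minimal sketch of how these three classes are meant to interact (the class names below are made up for the example):

from pressagio.observer import Observer, Observable, Dispatcher

class Counter(Observable):
    def __init__(self):
        super().__init__()
        self.value = 0
    def increment(self):
        self.value += 1
        self.notify()

class Logger(Observer):
    def __init__(self):
        self.dispatcher = Dispatcher(self)
    def update(self, observable):
        self.dispatcher.dispatch(observable)

logger = Logger()
counter = Counter()
# map() attaches the logger to the counter and dispatches once immediately.
logger.dispatcher.map(counter, lambda obs: print("value is now", obs.value))
counter.increment()   # notify() routes back through Logger.update -> dispatch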

View File

@ -0,0 +1,425 @@
# -*- coding: utf-8 -*-
#
# Poio Tools for Linguists
#
# Copyright (C) 2009-2013 Poio Project
# Author: Peter Bouda <pbouda@cidles.eu>
# URL: <http://media.cidles.eu/poio/>
# For license information, see LICENSE
"""
Classes for predictors and to handle suggestions and predictions.
"""
from __future__ import absolute_import, unicode_literals
import os
try:
import configparser
except ImportError:
import ConfigParser as configparser
from . import combiner
from . import dbconnector
#import pressagio.observer
MIN_PROBABILITY = 0.0
MAX_PROBABILITY = 1.0
class SuggestionException(Exception): pass
class UnknownCombinerException(Exception): pass
class PredictorRegistryException(Exception): pass
class Suggestion(object):
"""
Class for a simple suggestion, which consists of a string and a probability for that
string.
"""
def __init__(self, word, probability):
print("I am a suggetsion")
self.word = word
self._probability = probability
def __eq__(self, other):
if self.word == other.word and self.probability == other.probability:
return True
return False
def __lt__(self, other):
if self.probability < other.probability:
return True
if self.probability == other.probability:
return self.word < other.word
return False
def __repr__(self):
return "Word: {0} - Probability: {1}".format(
self.word, self.probability)
def probability():
doc = "The probability property."
def fget(self):
return self._probability
def fset(self, value):
if value < MIN_PROBABILITY or value > MAX_PROBABILITY:
raise SuggestionException("Probability is too high or too low.")
self._probability = value
def fdel(self):
del self._probability
return locals()
probability = property(**probability())
class Prediction(list):
"""
Class for predictions from predictors.
"""
def __init__(self):
pass
def __eq__(self, other):
if self is other:
return True
if len(self) != len(other):
return False
for i, s in enumerate(other):
if not s == self[i]:
return False
return True
def suggestion_for_token(self, token):
for s in self:
if s.word == token:
return s
def add_suggestion(self, suggestion):
if len(self) == 0:
self.append(suggestion)
else:
i = 0
while i < len(self) and suggestion < self[i]:
i += 1
self.insert(i, suggestion)
class PredictorActivator(object):
"""
PredictorActivator starts the execution of the active predictors,
monitors their execution and collects the predictions returned, or
terminates a predictor's execution if it exceeds its maximum
prediction time.
The predictions returned by the individual predictors are combined
into a single prediction by the active Combiner.
"""
def __init__(self, config, registry, context_tracker):
self.config = config
self.registry = registry
self.context_tracker = context_tracker
#self.dispatcher = pressagio.observer.Dispatcher(self)
self.predictions = []
self.combiner = None
self.max_partial_prediction_size = int(config.get(
"Selector", "suggestions"))
self.predict_time = None
self._combination_policy = None
def combination_policy():
doc = "The combination_policy property."
def fget(self):
return self._combination_policy
def fset(self, value):
self._combination_policy = value
if value.lower() == "meritocracy":
self.combiner = combiner.MeritocracyCombiner()
else:
raise UnknownCombinerException()
def fdel(self):
del self._combination_policy
return locals()
combination_policy = property(**combination_policy())
def predict(self, multiplier = 1, prediction_filter = None):
self.predictions[:] = []
for predictor in self.registry:
self.predictions.append(predictor.predict(
self.max_partial_prediction_size * multiplier,
prediction_filter))
result = self.combiner.combine(self.predictions)
return result
class PredictorRegistry(list): #pressagio.observer.Observer,
"""
Manages instantiation and iteration through predictors and aids in
generating predictions and learning.
PredictorRegistry class holds the active predictors and provides the
interface required to obtain an iterator to the predictors.
The standard use case is: Predictor obtains an iterator from
PredictorRegistry and invokes the predict() or learn() method on each
Predictor pointed to by the iterator.
Predictor registry should eventually just be a simple wrapper around
plump.
"""
def __init__(self, config, dbconnection = None):
self.config = config
self.dbconnection = dbconnection
self._context_tracker = None
self.set_predictors()
def context_tracker():
doc = "The context_tracker property."
def fget(self):
return self._context_tracker
def fset(self, value):
if self._context_tracker is not value:
self._context_tracker = value
self[:] = []
self.set_predictors()
def fdel(self):
del self._context_tracker
return locals()
context_tracker = property(**context_tracker())
def set_predictors(self):
if (self.context_tracker):
self[:] = []
for predictor in self.config.get("PredictorRegistry", "predictors")\
.split():
self.add_predictor(predictor)
def add_predictor(self, predictor_name):
predictor = None
if self.config.get(predictor_name, "predictor_class") == \
"SmoothedNgramPredictor":
predictor = SmoothedNgramPredictor(self.config,
self.context_tracker, predictor_name,
dbconnection = self.dbconnection)
if predictor:
self.append(predictor)
def close_database(self):
for predictor in self:
predictor.close_database()
class Predictor(object):
"""
Base class for predictors.
"""
def __init__(self, config, context_tracker, predictor_name,
short_desc = None, long_desc = None):
self.short_description = short_desc
self.long_description = long_desc
self.context_tracker = context_tracker
self.name = predictor_name
self.config = config
def token_satisfies_filter(self, token, prefix, token_filter):
if token_filter:
for char in token_filter:
candidate = prefix + char
if token.startswith(candidate):
return True
return False
class SmoothedNgramPredictor(Predictor): #, pressagio.observer.Observer
"""
Calculates prediction from n-gram model in sqlite database. You have to
create a database with the script `text2ngram` first.
"""
def __init__(self, config, context_tracker, predictor_name,
short_desc = None, long_desc = None, dbconnection = None):
Predictor.__init__(self, config, context_tracker, predictor_name,
short_desc, long_desc)
self.db = None
self.dbconnection = dbconnection
self.cardinality = None
self.learn_mode_set = False
self.dbclass = None
self.dbuser = None
self.dbpass = None
self.dbhost = None
self.dbport = None
self._database = None
self._deltas = None
self._learn_mode = None
self.config = config
self.name = predictor_name
self.context_tracker = context_tracker
self._read_config()
################################################## Properties
def deltas():
doc = "The deltas property."
def fget(self):
return self._deltas
def fset(self, value):
self._deltas = []
# make sure that values are floats
for i, d in enumerate(value):
self._deltas.append(float(d))
self.cardinality = len(value)
self.init_database_connector_if_ready()
def fdel(self):
del self._deltas
return locals()
deltas = property(**deltas())
def learn_mode():
doc = "The learn_mode property."
def fget(self):
return self._learn_mode
def fset(self, value):
self._learn_mode = value
self.learn_mode_set = True
self.init_database_connector_if_ready()
def fdel(self):
del self._learn_mode
return locals()
learn_mode = property(**learn_mode())
def database():
doc = "The database property."
def fget(self):
return self._database
def fset(self, value):
self._database = value
self.dbclass = self.config.get("Database", "class")
if self.dbclass == "PostgresDatabaseConnector":
self.dbuser = self.config.get("Database", "user")
self.dbpass = self.config.get("Database", "password")
self.dbhost = self.config.get("Database", "host")
self.dbport = self.config.get("Database", "port")
self.dblowercase = self.config.getboolean("Database",
"lowercase_mode")
self.dbnormalize = self.config.getboolean("Database",
"normalize_mode")
self.init_database_connector_if_ready()
def fdel(self):
del self._database
return locals()
database = property(**database())
#################################################### Methods
def init_database_connector_if_ready(self):
if self.database and len(self.database) > 0 and \
self.cardinality and self.cardinality > 0 and \
self.learn_mode_set:
if self.dbclass == "SqliteDatabaseConnector":
self.db = dbconnector.SqliteDatabaseConnector(
self.database, self.cardinality) #, self.learn_mode
elif self.dbclass == "PostgresDatabaseConnector":
self.db = dbconnector.PostgresDatabaseConnector(
self.database, self.cardinality, self.dbhost, self.dbport,
self.dbuser, self.dbpass, self.dbconnection)
self.db.lowercase = self.dblowercase
self.db.normalize = self.dbnormalize
self.db.open_database()
def ngram_to_string(self, ngram):
"|".join(ngram)
def predict(self, max_partial_prediction_size, filter):
print("SmoothedNgramPredictor Predicting")
print(filter)
tokens = [""] * self.cardinality
prediction = Prediction()
for i in range(self.cardinality):
tokens[self.cardinality - 1 - i] = self.context_tracker.token(i)
prefix_completion_candidates = []
for k in reversed(range(self.cardinality)):
if len(prefix_completion_candidates) >= max_partial_prediction_size:
break
prefix_ngram = tokens[(len(tokens) - k - 1):]
partial = None
if not filter:
partial = self.db.ngram_like_table(prefix_ngram,
max_partial_prediction_size - \
len(prefix_completion_candidates))
else:
partial = self.db.ngram_like_table_filtered(prefix_ngram, filter,
max_partial_prediction_size - \
len(prefix_completion_candidates))
print((partial))
for p in partial:
if len(prefix_completion_candidates) > \
max_partial_prediction_size:
break
candidate = p[-2] # ???
if candidate not in prefix_completion_candidates:
prefix_completion_candidates.append(candidate)
# smoothing
unigram_counts_sum = self.db.unigram_counts_sum()
for j, candidate in enumerate(prefix_completion_candidates):
#if j >= max_partial_prediction_size:
# break
tokens[self.cardinality - 1] = candidate
probability = 0
for k in range(self.cardinality):
numerator = self._count(tokens, 0, k + 1)
denominator = unigram_counts_sum
if numerator > 0:
denominator = self._count(tokens, -1, k)
frequency = 0
if denominator > 0:
frequency = float(numerator) / denominator
probability += self.deltas[k] * frequency
if probability > 0:
prediction.add_suggestion(Suggestion(tokens[self.cardinality - 1],
probability))
return(prediction)
def close_database(self):
self.db.close_database()
################################################ Private methods
def _read_config(self):
self.database = self.config.get("Database", "database")
self.deltas = self.config.get(self.name, "deltas").split()
self.learn_mode = self.config.get(self.name, "learn")
def _count(self, tokens, offset, ngram_size):
result = 0
if (ngram_size > 0):
ngram = \
tokens[len(tokens) - ngram_size + offset:\
len(tokens) + offset]
result = self.db.ngram_count(ngram)
else:
result = self.db.unigram_counts_sum()
return result
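
The facade in the package __init__ wires these classes in exactly this order; a minimal sketch of doing it by hand, assuming the package is importable as pressagio and a profile like the template further down in this commit is at hand:

import configparser
from pressagio import callback, context_tracker, predictor

class StreamCallback(callback.Callback):
    def __init__(self, text):
        super().__init__()
        self.stream = text  # past_stream() returns this buffer

config = configparser.ConfigParser()
config.read("profile.ini")  # placeholder; shaped like the template below

registry = predictor.PredictorRegistry(config)
tracker = context_tracker.ContextTracker(config, registry, StreamCallback("der links"))
activator = predictor.PredictorActivator(config, registry, tracker)
activator.combination_policy = "meritocracy"

for suggestion in activator.predict():
    print(suggestion.word, suggestion.probability)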

View File

@ -0,0 +1,8 @@
# -*- coding: utf-8 -*-
#
# Poio Tools for Linguists
#
# Copyright (C) 2009-2013 Poio Project
# Author: Peter Bouda <pbouda@cidles.eu>
# URL: <http://media.cidles.eu/poio/>
# For license information, see LICENSE

View File

@ -0,0 +1,27 @@
# -*- coding: utf-8 -*-
#
# Poio Tools for Linguists
#
# Copyright (C) 2009-2013 Poio Project
# Author: Peter Bouda <pbouda@cidles.eu>
# URL: <http://media.cidles.eu/poio/>
# For license information, see LICENSE
from __future__ import absolute_import, unicode_literals
import pressagio.character
def test_first_word_character():
assert pressagio.character.first_word_character("8238$§(a)jaj2u2388!") == 7
assert pressagio.character.first_word_character("123üäö34ashdh") == 3
assert pressagio.character.first_word_character("123&(/==") == -1
def test_last_word_character():
assert pressagio.character.last_word_character("8238$§(a)jaj2u2388!") == 13
assert pressagio.character.last_word_character("123üäö34ashdh") == 12
assert pressagio.character.last_word_character("123&(/==") == -1
def test_is_word_character():
assert pressagio.character.is_word_character("ä") == True
assert pressagio.character.is_word_character("1") == False
assert pressagio.character.is_word_character(".") == False

View File

@ -0,0 +1,74 @@
# -*- coding: utf-8 -*-
#
# Poio Tools for Linguists
#
# Copyright (C) 2009-2013 Poio Project
# Author: Peter Bouda <pbouda@cidles.eu>
# URL: <http://media.cidles.eu/poio/>
# For license information, see LICENSE
from __future__ import absolute_import, unicode_literals
import pressagio.predictor
import pressagio.combiner
class TestMeritocracyCombiner:
def setup(self):
self.combiner = pressagio.combiner.MeritocracyCombiner()
def _create_prediction(self):
prediction = pressagio.predictor.Prediction()
prediction.add_suggestion(pressagio.predictor.Suggestion(
"Test", 0.3))
prediction.add_suggestion(pressagio.predictor.Suggestion(
"Test2", 0.3))
prediction.add_suggestion(pressagio.predictor.Suggestion(
"Test", 0.1))
prediction.add_suggestion(pressagio.predictor.Suggestion(
"Test3", 0.2))
return prediction
def _create_prediction2(self):
prediction = pressagio.predictor.Prediction()
prediction.add_suggestion(pressagio.predictor.Suggestion(
"Test2", 0.3))
prediction.add_suggestion(pressagio.predictor.Suggestion(
"Test", 0.1))
prediction.add_suggestion(pressagio.predictor.Suggestion(
"Test3", 0.2))
return prediction
def test_filter(self):
result = self.combiner.filter(
self._create_prediction())
correct = pressagio.predictor.Prediction()
correct.add_suggestion(pressagio.predictor.Suggestion(
"Test3", 0.2))
correct.add_suggestion(pressagio.predictor.Suggestion(
"Test2", 0.3))
correct.add_suggestion(pressagio.predictor.Suggestion(
"Test", 0.4))
assert result == correct
def test_combine(self):
predictions = [ self._create_prediction2() ]
prediction2 = self._create_prediction2()
prediction2.add_suggestion(pressagio.predictor.Suggestion(
"Test4", 0.1))
predictions.append(prediction2)
result = self.combiner.combine(predictions)
correct = pressagio.predictor.Prediction()
correct.add_suggestion(pressagio.predictor.Suggestion(
"Test3", 0.4))
correct.add_suggestion(pressagio.predictor.Suggestion(
"Test2", 0.6))
correct.add_suggestion(pressagio.predictor.Suggestion(
"Test4", 0.1))
correct.add_suggestion(pressagio.predictor.Suggestion(
"Test", 0.2))
assert result == correct

View File

@ -0,0 +1,28 @@
Der Linksdenker von Peter Panter
"Er ist ein Gespenst und doch ein Münchner."
Alfred Polgar
Das war ein heiterer Abschied von Berlin: sechs Wochen Panke und ein Abend Karl Valentin die Rechnung ging ohne Rest auf.
Ich kam zu spät ins Theater, der Saal war bereits warm und voll Lachen. Es mochte grade begonnen haben, aber die Leute waren animiert und vergnügt wie sonst erst nach dem zweiten Akt. Am Podium der Bühne auf der Bühne, mitten in der Vorstadtkapelle, saß ein Mann mit einer aufgeklebten Perücke, er sah aus, wie man sich sonst wohl einen Provinzkomiker vorstellt: ich blickte angestrengt auf die Szene und wußte beim besten Willen nicht, was es da wohl zu lachen gäbe … Aber die Leute lachten wieder, und der Mann hatte doch gar nichts gesagt ... Und plötzlich schweifte mein Auge ab, vorn in der ersten Reihe saß noch Einer, den hatte ich bisher nicht bemerkt, und das war: ER.
Ein zaundürrer, langer Geselle, mit langen, spitzen Don-Quichotte-Beinen, mit winkligen, spitzigen Knien, einem Löchlein in der Hose, mit blankem, abgeschabtem Anzug. Sein Löchlein in der Hose er reibt eifrig daran herum. "Das wird Ihnen nichts nützen!" sagt der gestrenge Orchesterchef. Er, leise vor sich hin: "Mit Benzin wärs scho fort!" Leise sagt er das, leise, wie seine schauspielerischen Mittel. Er ist sanft und zerbrechlich, schillert in allen Farben wie eine Seifenblase; wenn er plötzlich zerplatzte, hätte sich Niemand zu wundern.
"Fertig!" klopft der Kapellmeister. Eins, zwei, drei da, einen Sechzehnteltakt zuvor, setzte der dürre Bläser ab und bedeutete dem Kapellmeister mit ernstem Zeigefinger: "s Krawattl rutscht Ihna heraus!" Aergerlich stopft sich der das Ding hinein. "Fertig!" Eins, zwei, drei … So viel, wie ein Auge Zeit braucht, die Wimper zu heben und zu senken, trennte die Kapelle noch von dem schmetternden Tusch da setzte der Lange ab und sah um sich. Der Kapellmeister klopfte ab. Was es nun wieder gäbe ? "Ich muß mal husten!" sagte der Lange. Pause. Das Orchester wartet. Aber nun kann er nicht. Eins, zwei, drei tätärätä! Es geht los.
Und es beginnt die seltsamste Komik, die wir seit langem auf der Bühne gesehen haben: ein Höllentanz der Vernunft um beide Pole des Irrsinns. Das ist eine kleine Seele, dieser Bläser, mit Verbandsorgan, Tarif, Stammtisch und Kollegenklatsch. Er ist ängstlich auf seinen vereinbarten Verdienst und ein bißchen darüber hinaus auf seinen Vorteil bedacht. "Spielen Sie genau, was da steht," sagt der Kapellmeister, "nicht zu viel und nicht zu wenig!" "Zu viel schon gar nicht!" sagt das Verbandsmitglied.
Oben auf der Bühne will der Vorhang nicht auseinander. "Geh mal sofort einer zum Tapezierer", sagt der Kapellmeister, "aber sofort, und sag ihm, er soll gelegentlich, wenn er Zeit hat, vorbeikommen." Geschieht. Der Tapezierer scheint sofort Zeit zu haben, denn er kommt mitten in die Sängerin hineingeplatzt. Steigt mit der Leiter auf die Bühne "Zu jener Zeit, wie liebt ich dich, mein Leben", heult die Sängerin und packt seine Instrumente aus, klopft, hämmert, macht … Seht doch Valentin! Er ist nicht zu halten. Was gibt es da? Was mag da sein? Er hat die Neugier der kleinen Leute. Immer geigend, denn das ist seine bezahlte Pflicht, richtet er sich hoch, steigt auf den Stuhl, reckt zwei Hälse, den seinen und den der Geige, klettert wieder herunter, schreitet durch das Orchester, nach oben auf die Bühne, steigt dort dem Tapezierer auf seiner Leiter nach, geigt und sieht, arbeitet und guckt, was es da Interessantes gibt … Ich muß lange zurückdenken, um mich zu erinnern, wann in einem Theater so gelacht worden ist.
Er denkt links. Vor Jahren hat er einmal in München in einem Bierkeller gepredigt: "Vorgestern bin ich mit meiner Großmutter in der Oper Lohengrin gewesen. Gestern nacht hat sie die ganze Oper nochmal geträumt; das wann i gwußt hätt, hätten wir gar nicht erst hingehen brauchen!"
Aber dieser Schreiber, der sich abends sein Brot durch einen kleinen Nebenverdienst aufbessert, wird plötzlich transparent, durchsichtig, über- und unterirdisch und beginnt zu leuchten. Berühren diese langen Beine noch die Erde?
Es erhebt sich das schwere Problem, eine Pauke von einem Ende der Bühne nach dem andern zu schaffen. Der Auftrag fällt auf Valentin. "I bin eigentlich a Bläser!" sagt er. Bläser schaffen keine Pauken fort. Aber, na … Laatscht hin. Allein geht es nicht. Sein Kollege soll helfen. Und hier wird die Sache durchaus mondsüchtig. "Schafft die Pauke her!" ruft der Kapellmeister ungeduldig. Der Kollege kneetscht in seinen Bart: "Muß das gleich sein?" Der Kapellmeister: "Bringt die Pauke her!" Valentin: "Der Andre laßt fragen, wann." "Der Andre" nicht: Peperl oder: Herr Schmidt oder: Kollege Hintermüller, sondern: der Andre. Der Andre wird Schicksal, Moira und nachbarlicher Kosmos. Sie drehen sich eine Weile um die Pauke, schließlich sagt "der Andre", er müsse hier stehen, denn er sei Linkshänder. Linkshänder? Vergessen sind Pauke, Kapellmeister und Theateraufführung Linkshänder! Und nun, ganz Shakespearisch: "Linkshänder bist? Alles links? Beim Schreiben auch? Beim Essen auch? Beim Schlucken auch? Beim Denken auch?" Und dann triumphierend: "Der Andre sagt, er ist links!" Welche Distanz ist da vom "Andern" wie diesseits ist man selbst, wie jenseits der Andre, wie verschieden, wie getrennt, wie weitab! Mitmensch? Nebenmensch.
Sicherlich legen wir hier das Philosophische hinein. Sicherlich hat Valentin theoretisch diese Gedankengänge nicht gehabt. Aber man zeige uns doch erst einmal einen Komiker, ein Gefäß, in das man so etwas hineinlegen kann. Bei Herrn Westermeier käme man nicht auf solche Gedanken. Hier aber erhebt sich zum Schluß eine Unterhaltung über den Zufall, ein Hin und Her, kleine magische Funken, die aus einem merkwürdig konstruierten Gehirn sprühen. Er sei Unter den Linden spaziert, mit dem Nebenmann, da hätten sie von einem Radfahrer gesprochen und da sei gerade einer des Wegs gekommen. Dies zum Kapitel: Zufall. Der Kapellmeister tobt. Das sei kein Zufall das sei Unsinn. Da kämen tausend Radfahrer täglich vorbei. "Na ja", sagt Valentin, "aber es ist grad Einer kumma!" Unvorstellbar, wie so etwas ausgedacht, geschrieben, probiert wird. Die Komik der irrealen Potentialsätze, die monströse Zerlegung des Satzes: "Ich sehe, daß er nicht da ist!" (was sich da erhebt, ist überhaupt nicht zu sagen!) die stille Dummheit dieses Witzes, der irrational ist und die leise Komponente des korrigierenden Menschenverstandes nicht aufweist, zwischendurch trinkt er aus einem Seidel Bier, kaut etwas, das er in der Tasche aufbewahrt hatte, denkt mit dem Zeigefinger und hat seine kleine Privatfreude, wenn sich der Kapellmeister geirrt hat. Eine kleine Seele. Als Hans Reimann einmal eine Rundfrage stellte, was sich Jedermann wünschen würde, wenn ihm eine Fee drei Wünsche freistellte, hat Karl Valentin geantwortet: "1.) Ewige Gesundheit. 2.) Einen Leibarzt." Eine kleine Seele.
Und ein großer Künstler. Wenn ihn nur nicht die berliner Unternehmer einfangen möchten! Das Geheimnis dieses primitiven Ensembles ist seine kräftige Naivität. Das ist eben so, und wems nicht paßt, der soll nicht zuschauen. Gott behüte, wenn man den zu Duetten und komischen Couplets abrichtete! Mit diesen verdrossenen, verquälten, nervösen Regisseuren und Direktoren auf der Probe, die nicht zuhören und zunächst einmal zu Allem Nein sagen. Mit diesem Drum und Dran von unangenehmen berliner Typen, die vorgeben, zu wissen, was das Publikum will, mit dem sie ihren nicht sehr heitern Kreis identifizieren, mit diesen überarbeiteten und unfrohen Gesellen, die nicht mehr fähig sind, von Herzen über das Einfache zu lachen, "weil es schon dagewesen ist". Sie jedenfalls sind immer schon dagewesen. Karl Valentin aber nur ein Mal, weil er ein seltener, trauriger, unirdischer, maßlos lustiger Komiker ist, der links denkt.
Quelle: http://de.wikisource.org/wiki/Der_Linksdenker

View File

@ -0,0 +1,26 @@
# Template for profiles
[Database]
class = SqliteDatabaseConnector
database = c:/Users/Peter/Projects/git-github/pressagio/src/pressagio/tests/test_data/test.db
[PredictorRegistry]
predictors = DefaultSmoothedNgramPredictor
[DefaultSmoothedNgramPredictor]
predictor_class = SmoothedNgramPredictor
deltas = 0.01 0.1 0.89
learn = True
[ContextTracker]
sliding_window_size = 80
lowercase_mode = True
[Selector]
suggestions = 6
repeat_suggestions = no
greedy_suggestion_threshold = 0
[PredictorActivator]
predict_time = 100
max_partial_prediction_size = 60
combination_policy = Meritocracy
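
This is the profile shape the autocorrector at the top of this commit consumes: it reads the shipped pressagio_config.ini and only swaps the Database/database entry for the per-language corpus, roughly as follows (the path is a placeholder for what get_media_path resolves in the app):

import configparser

pres_config = configparser.ConfigParser()
pres_config.read("pressagio_config.ini")
pres_config.set("Database", "database", "corpora/en_US.sqlite")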

View File

@ -0,0 +1,253 @@
# -*- coding: utf-8 -*-
#
# Poio Tools for Linguists
#
# Copyright (C) 2001-2013 Poio Project
# Author: Peter Bouda <pbouda@cidles.eu>
# URL: <http://www.cidles.eu/ltll/poio>
# For license information, see LICENSE
from __future__ import absolute_import, unicode_literals
import os
import pressagio.dbconnector
psycopg2_installed = False
try:
import psycopg2
psycopg2_installed = True
except ImportError:
pass
class TestSqliteDatabaseConnector():
def setup(self):
self.filename = os.path.abspath(os.path.join(os.path.dirname( __file__ ),
'test_data', 'test.db'))
self.connector = pressagio.dbconnector.SqliteDatabaseConnector(self.filename)
self.connector.open_database()
def test_execute_sql(self):
self.connector.execute_sql("CREATE TABLE IF NOT EXISTS test ( c1 TEXT, c2 INTEGER );")
def test_create_ngram_table(self):
self.connector.create_ngram_table(1)
result = self.connector.execute_sql(
"SELECT name FROM sqlite_master WHERE type='table' AND name='_1_gram';")
assert result == [('_1_gram',)]
self.connector.execute_sql("DROP TABLE _1_gram;")
self.connector.create_ngram_table(2)
result = self.connector.execute_sql(
"SELECT name FROM sqlite_master WHERE type='table' AND name='_2_gram';")
assert result == [('_2_gram',)]
self.connector.execute_sql("DROP TABLE _2_gram;")
self.connector.create_ngram_table(3)
result = self.connector.execute_sql(
"SELECT name FROM sqlite_master WHERE type='table' AND name='_3_gram';")
assert result == [('_3_gram',)]
self.connector.execute_sql("DROP TABLE _3_gram;")
def test_create_index(self):
self.connector.create_ngram_table(2)
self.connector.insert_ngram(('der', 'linksdenker'), 22)
self.connector.create_index(2)
result = self.connector.execute_sql(
"SELECT name FROM sqlite_master WHERE type='index' \
AND name='idx_2_gram_1';")
assert result == [('idx_2_gram_1',)]
self.connector.execute_sql("DROP TABLE _2_gram;")
def test_create_unigram_table(self):
self.connector.create_unigram_table()
result = self.connector.execute_sql(
"SELECT name FROM sqlite_master WHERE type='table' AND name='_1_gram';")
assert result == [('_1_gram',)]
self.connector.execute_sql("DROP TABLE _1_gram;")
def test_create_bigram_table(self):
self.connector.create_bigram_table()
result = self.connector.execute_sql(
"SELECT name FROM sqlite_master WHERE type='table' AND name='_2_gram';")
assert result == [('_2_gram',)]
self.connector.execute_sql("DROP TABLE _2_gram;")
def test_create_trigram_table(self):
self.connector.create_trigram_table()
result = self.connector.execute_sql(
"SELECT name FROM sqlite_master WHERE type='table' AND name='_3_gram';")
assert result == [('_3_gram',)]
self.connector.execute_sql("DROP TABLE _3_gram;")
def test_insert_ngram(self):
self.connector.create_bigram_table()
self.connector.insert_ngram(('der', 'linksdenker'), 22)
result = self.connector.execute_sql("SELECT * FROM _2_gram")
assert result == [('der', 'linksdenker', 22)]
self.connector.execute_sql("DROP TABLE _2_gram;")
def test_update_ngram(self):
self.connector.create_bigram_table()
# Insert
self.connector.insert_ngram(('der', 'linksdenker'), 22)
result = self.connector.execute_sql("SELECT * FROM _2_gram")
assert result == [('der', 'linksdenker', 22)]
# Update
self.connector.update_ngram(('der', 'linksdenker'), 44)
result = self.connector.execute_sql("SELECT * FROM _2_gram")
assert result == [('der', 'linksdenker', 44)]
self.connector.execute_sql("DROP TABLE _2_gram;")
def test_ngram_count(self):
self.connector.create_bigram_table()
self.connector.insert_ngram(('der', 'linksdenker'), 22)
result = self.connector.ngram_count(('der', 'linksdenker'))
assert result == 22
self.connector.execute_sql("DROP TABLE _2_gram;")
def test_ngram_like_table(self):
self.connector.create_bigram_table()
self.connector.insert_ngram(('der', 'linksdenker'), 22)
self.connector.insert_ngram(('der', 'linksabbieger'), 32)
result = self.connector.ngram_like_table(('der', 'links'))
assert result == [('der', 'linksabbieger', 32), (
'der', 'linksdenker', 22)]
self.connector.execute_sql("DROP TABLE _2_gram;")
def teardown(self):
self.connector.close_database()
if os.path.isfile(self.filename):
os.remove(self.filename)
if psycopg2_installed:
class TestPostgresDatabaseConnector():
def setup(self):
self.connector = pressagio.dbconnector.PostgresDatabaseConnector("test")
self.connector.create_database()
self.connector.open_database()
def test_create_database(self):
self.connector.create_database()
def test_create_ngram_table(self):
self.connector.create_ngram_table(1)
result = self.connector.execute_sql(
"SELECT * FROM information_schema.tables WHERE table_name='_1_gram';")
assert len(result) == 1
self.connector.execute_sql("DROP TABLE _1_gram;")
self.connector.create_ngram_table(2)
result = self.connector.execute_sql(
"SELECT * FROM information_schema.tables WHERE table_name='_2_gram';")
assert len(result) == 1
self.connector.execute_sql("DROP TABLE _2_gram;")
self.connector.create_ngram_table(3)
result = self.connector.execute_sql(
"SELECT * FROM information_schema.tables WHERE table_name='_3_gram';")
assert len(result) == 1
self.connector.execute_sql("DROP TABLE _3_gram;")
def test_create_unigram_table(self):
self.connector.create_unigram_table()
result = self.connector.execute_sql(
"SELECT * FROM information_schema.tables WHERE table_name='_1_gram';")
assert len(result) == 1
self.connector.execute_sql("DROP TABLE _1_gram;")
def test_create_bigram_table(self):
self.connector.create_bigram_table()
result = self.connector.execute_sql(
"SELECT * FROM information_schema.tables WHERE table_name='_2_gram';")
assert len(result) == 1
self.connector.execute_sql("DROP TABLE _2_gram;")
def test_create_trigram_table(self):
self.connector.create_trigram_table()
result = self.connector.execute_sql(
"SELECT * FROM information_schema.tables WHERE table_name='_3_gram';")
assert len(result) == 1
self.connector.execute_sql("DROP TABLE _3_gram;")
def test_insert_ngram(self):
self.connector.create_bigram_table()
self.connector.insert_ngram(('der', 'linksdenker'), 22)
result = self.connector.execute_sql("SELECT * FROM _2_gram")
assert result == [('der', 'linksdenker', 22)]
self.connector.execute_sql("DROP TABLE _2_gram;")
def test_update_ngram(self):
self.connector.create_bigram_table()
# Insert
self.connector.insert_ngram(('der', 'linksdenker'), 22)
result = self.connector.execute_sql("SELECT * FROM _2_gram")
assert result == [('der', 'linksdenker', 22)]
# Update
self.connector.update_ngram(('der', 'linksdenker'), 44)
result = self.connector.execute_sql("SELECT * FROM _2_gram")
assert result == [('der', 'linksdenker', 44)]
self.connector.execute_sql("DROP TABLE _2_gram;")
def test_ngram_count(self):
self.connector.create_bigram_table()
self.connector.insert_ngram(('der', 'linksdenker'), 22)
result = self.connector.ngram_count(('der', 'linksdenker'))
assert result == 22
self.connector.execute_sql("DROP TABLE _2_gram;")
def test_ngram_like_table(self):
self.connector.create_bigram_table()
self.connector.insert_ngram(('der', 'linksdenker'), 22)
self.connector.insert_ngram(('der', 'linksabbieger'), 32)
result = self.connector.ngram_like_table(('der', 'links'))
assert result == [('der', 'linksabbieger', 32), (
'der', 'linksdenker', 22)]
self.connector.execute_sql("DROP TABLE _2_gram;")
# testing lowercase mode
self.connector.lowercase = True
self.connector.close_database()
self.connector.reset_database()
self.connector.open_database()
self.connector.create_bigram_table()
self.connector.insert_ngram(('Der', 'Linksdenker'), 22)
self.connector.insert_ngram(('Der', 'Linksabbieger'), 32)
result = self.connector.ngram_like_table(('der', 'links'))
assert result == [('Der', 'Linksabbieger', 32), (
'Der', 'Linksdenker', 22)]
self.connector.execute_sql("DROP TABLE _2_gram;")
# testing normalize mode
self.connector.normalize = True
self.connector.close_database()
self.connector.reset_database()
self.connector.open_database()
self.connector.create_bigram_table()
self.connector.insert_ngram(('Der', 'Lünksdenker'), 22)
self.connector.insert_ngram(('Der', 'Lünksabbieger'), 32)
result = self.connector.ngram_like_table(('der', 'lunks'))
assert result == [('Der', 'Lünksabbieger', 32), (
'Der', 'Lünksdenker', 22)]
self.connector.execute_sql("DROP TABLE _2_gram;")
self.connector.normalize = False
self.connector.lowercase = False
def teardown(self):
self.connector.close_database()
con = psycopg2.connect(database="postgres", user="postgres")
con.set_isolation_level(
psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
c = con.cursor()
c.execute("DROP DATABASE test;")
con.close()
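For reference, the connector calls exercised by these tests can be strung together outside the harness; the following is a minimal sketch using only methods seen above, with the database filename being an illustrative assumption.

# Sketch: store and query a bigram count with the SQLite connector.
# Only calls exercised by the tests above are used; the path is illustrative.
import pressagio.dbconnector

connector = pressagio.dbconnector.SqliteDatabaseConnector("/tmp/ngrams.db")
connector.open_database()
connector.create_bigram_table()
connector.insert_ngram(("der", "linksdenker"), 22)
connector.update_ngram(("der", "linksdenker"), 44)
assert connector.ngram_count(("der", "linksdenker")) == 44
print(connector.ngram_like_table(("der", "links")))  # prefix lookup on the last token
connector.close_database()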

View File

@ -0,0 +1,143 @@
# -*- coding: utf-8 -*-
#
# Poio Tools for Linguists
#
# Copyright (C) 2009-2013 Poio Project
# Author: Peter Bouda <pbouda@cidles.eu>
# URL: <http://media.cidles.eu/poio/>
# For license information, see LICENSE
from __future__ import absolute_import, unicode_literals
import os
try:
import configparser
except ImportError:
import ConfigParser as configparser
import pressagio.predictor
import pressagio.tokenizer
import pressagio.dbconnector
import pressagio.context_tracker
import pressagio.callback
class TestSuggestion():
def setup(self):
self.suggestion = pressagio.predictor.Suggestion("Test", 0.3)
def test_probability(self):
self.suggestion.probability = 0.1
assert self.suggestion.probability == 0.1
class TestPrediction():
def setup(self):
self.prediction = pressagio.predictor.Prediction()
def test_add_suggestion(self):
self.prediction.add_suggestion(pressagio.predictor.Suggestion(
"Test", 0.3))
assert self.prediction[0].word == "Test"
assert self.prediction[0].probability == 0.3
self.prediction.add_suggestion(pressagio.predictor.Suggestion(
"Test2", 0.2))
assert self.prediction[0].word == "Test"
assert self.prediction[0].probability == 0.3
assert self.prediction[1].word == "Test2"
assert self.prediction[1].probability == 0.2
self.prediction.add_suggestion(pressagio.predictor.Suggestion(
"Test3", 0.6))
assert self.prediction[0].word == "Test3"
assert self.prediction[0].probability == 0.6
assert self.prediction[1].word == "Test"
assert self.prediction[1].probability == 0.3
assert self.prediction[2].word == "Test2"
assert self.prediction[2].probability == 0.2
self.prediction[:] = []
def test_suggestion_for_token(self):
self.prediction.add_suggestion(pressagio.predictor.Suggestion(
"Token", 0.8))
assert self.prediction.suggestion_for_token("Token").probability == 0.8
self.prediction[:] = []
class StringStreamCallback(pressagio.callback.Callback):
def __init__(self, stream):
pressagio.callback.Callback.__init__(self)
self.stream = stream
class TestSmoothedNgramPredictor():
def setup(self):
self.dbfilename = os.path.abspath(os.path.join(
os.path.dirname( __file__ ), 'test_data', 'test.db'))
self.infile = os.path.abspath(os.path.join(os.path.dirname( __file__ ),
'test_data', 'der_linksdenker.txt'))
for ngram_size in range(3):
ngram_map = pressagio.tokenizer.forward_tokenize_file(
self.infile, ngram_size + 1, False)
pressagio.dbconnector.insert_ngram_map_sqlite(ngram_map, ngram_size + 1,
self.dbfilename, False)
config_file = os.path.abspath(os.path.join(os.path.dirname( __file__ ),
'test_data', 'profile_smoothedngram.ini'))
config = configparser.ConfigParser()
config.read(config_file)
config.set("Database", "database", self.dbfilename)
self.predictor_registry = pressagio.predictor.PredictorRegistry(config)
self.callback = StringStreamCallback("")
context_tracker = pressagio.context_tracker.ContextTracker(
config, self.predictor_registry, self.callback)
def test_predict(self):
predictor = self.predictor_registry[0]
predictions = predictor.predict(6, None)
assert len(predictions) == 6
words = []
for p in predictions:
words.append(p.word)
assert "er" in words
assert "der" in words
assert "die" in words
assert "und" in words
assert "nicht" in words
self.callback.stream="d"
predictions = predictor.predict(6, None)
assert len(predictions) == 6
words = []
for p in predictions:
words.append(p.word)
assert "der" in words
assert "die" in words
assert "das" in words
assert "da" in words
assert "Der" in words
self.callback.stream="de"
predictions = predictor.predict(6, None)
assert len(predictions) == 6
words = []
for p in predictions:
words.append(p.word)
assert "der" in words
assert "Der" in words
assert "dem" in words
assert "den" in words
assert "des" in words
def teardown(self):
if self.predictor_registry[0].db:
self.predictor_registry[0].db.close_database()
del(self.predictor_registry[0])
if os.path.isfile(self.dbfilename):
os.remove(self.dbfilename)

View File

@ -0,0 +1,91 @@
# -*- coding: utf-8 -*-
#
# Poio Tools for Linguists
#
# Copyright (C) 2009-2013 Poio Project
# Author: Peter Bouda <pbouda@cidles.eu>
# URL: <http://media.cidles.eu/poio/>
# For license information, see LICENSE
from __future__ import absolute_import, unicode_literals
import os
import codecs
import pressagio.tokenizer
class TestForwardTokenizer():
def setup(self):
filename = os.path.abspath(os.path.join(os.path.dirname( __file__ ),
'test_data', 'der_linksdenker.txt'))
self.tokenizer = pressagio.tokenizer.ForwardTokenizer(filename)
def test_reset_stream(self):
self.tokenizer.next_token()
assert self.tokenizer.offset != 0
self.tokenizer.reset_stream()
assert self.tokenizer.offset == 0
def test_count_characters(self):
# TODO: Windows tokenization is different, check why
assert self.tokenizer.count_characters() == 7954
def test_count_tokens(self):
assert self.tokenizer.count_tokens() == 1235
def test_has_more_tokens(self):
assert self.tokenizer.has_more_tokens() == True
def test_next_token(self):
assert self.tokenizer.next_token() == "Der"
self.tokenizer.reset_stream()
def test_is_blankspace(self):
assert self.tokenizer.is_blankspace('\n') == True
assert self.tokenizer.is_blankspace('a') == False
def test_is_separator(self):
assert self.tokenizer.is_separator('"') == True
assert self.tokenizer.is_separator('b') == False
class TestReverseTokenizer():
def setup(self):
filename = os.path.abspath(os.path.join(os.path.dirname( __file__ ),
'test_data', 'der_linksdenker.txt'))
self.tokenizer = pressagio.tokenizer.ReverseTokenizer(filename)
def test_reset_stream(self):
self.tokenizer.next_token()
assert self.tokenizer.offset != self.tokenizer.offend
self.tokenizer.reset_stream()
assert self.tokenizer.offset == self.tokenizer.offend
def test_count_tokens(self):
assert self.tokenizer.count_tokens() == 1235
def test_has_more_tokens(self):
assert self.tokenizer.has_more_tokens() == True
def test_next_token(self):
assert self.tokenizer.next_token() == "Linksdenker"
self.tokenizer.reset_stream()
def test_tokenizers_are_equal():
filename = os.path.abspath(os.path.join(os.path.dirname( __file__ ),
'test_data', 'der_linksdenker.txt'))
reverse_tokenizer = pressagio.tokenizer.ReverseTokenizer(filename)
forward_tokenizer = pressagio.tokenizer.ForwardTokenizer(filename)
forward_tokens = []
reverse_tokens = []
while forward_tokenizer.has_more_tokens():
forward_tokens.append(forward_tokenizer.next_token())
while reverse_tokenizer.has_more_tokens():
reverse_tokens.append(reverse_tokenizer.next_token())
diff = set(forward_tokens) ^ set(reverse_tokens)
assert forward_tokens == reverse_tokens[::-1]
assert len(diff) == 0

View File

@ -0,0 +1,289 @@
# -*- coding: utf-8 -*-
#
# Poio Tools for Linguists
#
# Copyright (C) 2009-2013 Poio Project
# Author: Peter Bouda <pbouda@cidles.eu>
# URL: <http://media.cidles.eu/poio/>
# For license information, see LICENSE
"""
Several classes to tokenize text.
"""
from __future__ import absolute_import, unicode_literals
import abc
import codecs
import collections
from . import character
class Tokenizer(object):
"""
Base class for all tokenizers.
"""
__metaclass__ = abc.ABCMeta
def __init__(self, stream, blankspaces = character.blankspaces,
separators = character.separators):
"""
Constructor of the Tokenizer base class.
Parameters
----------
stream : str or io.IOBase
The stream to tokenize. Can be a filename or any open IO stream.
blankspaces : str
The characters that represent empty spaces.
separators : str
The characters that separate token units (e.g. word boundaries).
"""
self.separators = separators
self.blankspaces = blankspaces
self.lowercase = False
self.offbeg = 0
self.offset = None
self.offend = None
def is_blankspace(self, char):
"""
Test if a character is a blankspace.
Parameters
----------
char : str
The character to test.
Returns
-------
ret : bool
True if character is a blankspace, False otherwise.
"""
if len(char) > 1:
raise TypeError("Expected a char.")
if char in self.blankspaces:
return True
return False
def is_separator(self, char):
"""
Test if a character is a separator.
Parameters
----------
char : str
The character to test.
Returns
-------
ret : bool
True if character is a separator, False otherwise.
"""
if len(char) > 1:
raise TypeError("Expected a char.")
if char in self.separators:
return True
return False
@abc.abstractmethod
def count_characters(self):
raise NotImplementedError("Method must be implemented")
@abc.abstractmethod
def reset_stream(self):
raise NotImplementedError("Method must be implemented")
@abc.abstractmethod
def count_tokens(self):
raise NotImplementedError("Method must be implemented")
@abc.abstractmethod
def has_more_tokens(self):
raise NotImplementedError("Method must be implemented")
@abc.abstractmethod
def next_token(self):
raise NotImplementedError("Method must be implemented")
@abc.abstractmethod
def progress(self):
raise NotImplementedError("Method must be implemented")
class ForwardTokenizer(Tokenizer):
def __init__(self, stream, blankspaces = character.blankspaces,
separators = character.separators):
Tokenizer.__init__(self, stream, blankspaces, separators)
if not hasattr(stream, 'read'):
stream = codecs.open(stream, "r", "utf-8")
self.text = stream.read()
stream.close()
self.offend = self.count_characters() - 1
self.reset_stream()
def count_tokens(self):
count = 0
while(self.has_more_tokens()):
count += 1
self.next_token()
self.reset_stream()
return count
def count_characters(self):
"""
Counts the number of unicode characters in the IO stream.
"""
return len(self.text)
def has_more_tokens(self):
if self.offset < self.offend:
return True
return False
def next_token(self):
current = self.text[self.offset]
self.offset += 1
token = ""
if self.offset <= self.offend:
while (self.is_blankspace(current) or self.is_separator(current)) \
and self.offset < self.offend:
current = self.text[self.offset]
self.offset += 1
while not self.is_blankspace(current) and not self.is_separator(
current) and self.offset <= self.offend:
if self.lowercase:
current = current.lower()
token += current
current = self.text[self.offset]
self.offset += 1
if self.offset > self.offend:
token += self.text[-1]
return token
def progress(self):
return float(self.offset) / self.offend
def reset_stream(self):
self.offset = 0
class ReverseTokenizer(Tokenizer):
def __init__(self, stream, blankspaces = character.blankspaces,
separators = character.separators):
Tokenizer.__init__(self, stream, blankspaces, separators)
if not hasattr(stream, 'read'):
stream = codecs.open(stream, "r", "utf-8")
self.text = stream.read()
stream.close()
self.offend = self.count_characters() - 1
self.offset = self.offend
def count_tokens(self):
curroff = self.offset
self.offset = self.offend
count = 0
while (self.has_more_tokens()):
self.next_token()
count += 1
self.offset = curroff
return count
def count_characters(self):
"""
Counts the number of unicode characters in the IO stream.
"""
return len(self.text)
def has_more_tokens(self):
if (self.offbeg <= self.offset):
return True
else:
return False
def next_token(self):
token = ""
while (self.offbeg <= self.offset) and len(token) == 0:
current = self.text[self.offset]
if (self.offset == self.offend) and (self.is_separator(current) \
or self.is_blankspace(current)):
self.offset -= 1
return token
while (self.is_blankspace(current) or self.is_separator(current)) \
and self.offbeg < self.offset:
self.offset -= 1
if (self.offbeg <= self.offset):
current = self.text[self.offset]
while not self.is_blankspace(current) and not self.is_separator(
current) and self.offbeg <= self.offset:
if self.lowercase:
current = current.lower()
token = current + token
self.offset -= 1
if (self.offbeg <= self.offset):
current = self.text[self.offset]
return token
def progress(self):
return float(self.offend - self.offset) / (self.offend - self.offbeg)
def reset_stream(self):
self.offset = self.offend
def forward_tokenize_file(infile, ngram_size, lowercase=False, cutoff=0):
ngram_map = collections.defaultdict(int)
ngram_list = []
tokenizer = ForwardTokenizer(infile)
tokenizer.lowercase = lowercase
for i in range(ngram_size - 1):
if not tokenizer.has_more_tokens():
break
ngram_list.append(tokenizer.next_token())
while (tokenizer.has_more_tokens()):
token = tokenizer.next_token()
ngram_list.append(token)
ngram_map[tuple(ngram_list)] += 1
ngram_list.pop(0)
if cutoff > 0:
# iterate over a copy of the keys so entries can be dropped while looping
for k in list(ngram_map.keys()):
if ngram_map[k] <= cutoff:
del ngram_map[k]
return ngram_map
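A minimal sketch of how these tokenizers feed the n-gram database, modelled on the setup in test_predictor.py above; the corpus and database filenames are illustrative assumptions.

# Sketch: tokenize a corpus and load unigram..trigram counts into SQLite,
# mirroring the predictor test setup; file names are illustrative.
import pressagio.dbconnector
import pressagio.tokenizer

corpus = "der_linksdenker.txt"
database = "test.db"

for ngram_size in range(1, 4):
    ngram_map = pressagio.tokenizer.forward_tokenize_file(corpus, ngram_size, False)
    pressagio.dbconnector.insert_ngram_map_sqlite(ngram_map, ngram_size, database, False)

# The tokenizers can also be driven token by token:
tokenizer = pressagio.tokenizer.ForwardTokenizer(corpus)
while tokenizer.has_more_tokens():
    print(tokenizer.next_token())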