added some of said experimental features

experimental
Wolf Vollprecht 2014-10-02 19:04:22 +02:00
parent 8fdc9f465a
commit 9807741b13
20 changed files with 2729 additions and 0 deletions

1
.gitignore vendored
View File

@ -1,4 +1,5 @@
build/lib.linux-x86_64-2.7
*.pyc
__pycache__/
build/scripts-2.7
build/share

View File

@ -0,0 +1,201 @@
# UberwriterAutoCorrect
# The Uberwriter Auto Correct is an auto-correction
# mechanism to prevent stupid typos
# import presage
import configparser
import os
import pickle
import re
import xml.etree.ElementTree as ET

import enchant
# d = enchant.Dict("de_DE")
from gi.repository import Gtk, Gdk
from Levenshtein import distance

import uberwriter_lib.pressagio as pressagio
import uberwriter_lib.pressagio.callback
import uberwriter_lib.pressagio.context_tracker
import uberwriter_lib.pressagio.dbconnector
import uberwriter_lib.pressagio.predictor
import uberwriter_lib.pressagio.tokenizer
from uberwriter_lib.helpers import get_media_path
# Define and create PressagioCallback object
class PressagioCallback(pressagio.callback.Callback):
    """Callback feeding the editor's text to the pressagio predictor."""

    def __init__(self, buffer):
        super().__init__()
        self.buffer = buffer

    def past_stream(self):
        # Everything typed so far is the "past" context.
        return self.buffer

    def future_stream(self):
        # Nothing ahead of the cursor is ever exposed.
        return ''
class UberwriterAutoCorrect:
    """Inline auto-correction for the UberWriter text view.

    Watches the text buffer; when a word boundary is reached the word just
    typed is checked against an enchant spell-check dictionary (and,
    optionally, the pressagio predictor).  A small bubble with the best
    suggestion is shown next to the cursor; typing a separator character
    accepts the suggestion.
    """

    def show_bubble(self, iter, suggestion):
        """Show (or update) the suggestion bubble at *iter*."""
        self.suggestion = suggestion
        if self.bubble:
            # Bubble already on screen -- just swap the text.
            self.bubble_label.set_text(suggestion)
        else:
            pos = self.TextView.get_iter_location(iter)
            # Place the bubble just below the current line.
            pos_adjusted = self.TextView.buffer_to_window_coords(
                Gtk.TextWindowType.TEXT, pos.x, pos.y + pos.height)
            self.bubble = Gtk.Grid.new()
            self.bubble.set_name("AutoCorrect")
            self.TextView.add_child_in_window(
                self.bubble, Gtk.TextWindowType.TEXT,
                pos_adjusted[0], pos_adjusted[1])
            self.bubble_label = Gtk.Label.new(suggestion)
            self.bubble.attach(self.bubble_label, 0, 0, 1, 1)
            close = Gtk.Image.new_from_icon_name(
                'dialog-close', Gtk.IconSize.SMALL_TOOLBAR)
            self.bubble.attach(close, 1, 0, 1, 1)
            self.bubble.show_all()

    def suggest(self, stump, context):
        """Look up a correction for *stump*, the word being typed.

        *context* is a list of up to two preceding words used as prediction
        context when pressagio is enabled.
        """
        if self.enchant_dict.check(stump):
            # Correctly spelled -- nothing to suggest.
            self.destroy_bubble()
            return
        self.callback.buffer = (' '.join(context) + ' ' + stump).strip()
        predictions = []
        if self.use_pressagio:
            predictions = self.prsgio.predict(6, None)
        if not len(predictions):
            # Fall back to enchant's spelling suggestions.
            predictions = self.enchant_dict.suggest(stump)
            if not predictions:
                # Bug fix: previously an empty suggestion list caused an
                # IndexError further down.
                self.destroy_bubble()
                return
            # Rank enchant's suggestions by corpus frequency, most
            # common first.  Bug fix: the ranked result was previously
            # computed and then discarded in favor of predictions[0].
            suggestions_map = [
                {'suggestion': word,
                 'freq': self.frequency_dict.get(word, 0)}
                for word in predictions]
            suggestions_map.sort(key=lambda x: x['freq'], reverse=True)
            prediction = suggestions_map[0]['suggestion']
        else:
            prediction = predictions[0].word
        anchor_iter = self.buffer.get_iter_at_mark(self.buffer.get_insert())
        anchor_iter.backward_visible_word_start()
        if len(stump) >= 1:
            self.show_bubble(anchor_iter, prediction)

    def destroy_bubble(self, *args):
        """Remove the suggestion bubble, if present."""
        if not self.bubble:
            return
        self.bubble.destroy()
        self.bubble = None
        self.suggestion = ''

    def get_frequency_dict(self, language):
        """Load the word-frequency dictionary for the current language.

        A pickled copy is preferred; otherwise the XML wordlist is parsed
        and a pickle cache is written for the next start-up.
        NOTE(review): the *language* argument is currently ignored by the
        XML fallback, which always loads the en_US wordlist -- confirm.
        """
        self.frequency_dict = {}
        pp_pickled = get_media_path(
            "frequency_dict_" + self.language + ".pickle")
        if pp_pickled and os.path.isfile(pp_pickled):
            with open(pp_pickled, 'rb') as f:
                self.frequency_dict = pickle.load(f)
        else:
            pp = get_media_path('wordlists/en_us_wordlist.xml')
            frequencies = ET.parse(pp)
            root = frequencies.getroot()
            for child in root:
                # Each entry looks like <w f="count">word</w>.
                self.frequency_dict[child.text] = int(child.attrib['f'])
            with open('pickled_dict', 'wb+') as f:
                pickle.dump(self.frequency_dict, f)

    def accept_suggestion(self, append=""):
        """Replace the word before the cursor with the current suggestion."""
        curr_iter = self.buffer.get_iter_at_mark(self.buffer.get_insert())
        start_iter = curr_iter.copy()
        start_iter.backward_visible_word_start()
        self.buffer.delete(start_iter, curr_iter)
        self.buffer.insert_at_cursor(self.suggestion + append)
        self.destroy_bubble()

    def key_pressed(self, widget, event):
        """Dismiss the bubble on Escape/Backspace; never swallow the key."""
        if not self.bubble:
            return False
        if event.keyval in (Gdk.KEY_Escape, Gdk.KEY_BackSpace):
            self.destroy_bubble()
        return False

    def text_insert(self, buffer, location, text, length, data=None):
        """'insert-text' handler: accept or refresh the suggestion.

        A separator character accepts the pending suggestion; reaching the
        end of a word triggers a new lookup.  (The length parameter was
        renamed from `len`, which shadowed the builtin.)
        """
        if self.suggestion and text in [' ', '\t', '\n', '.', '?', '!',
                                        ',', ';', '\'', '"', ')', ':']:
            self.accept_suggestion(append=text)
            location.assign(
                self.buffer.get_iter_at_mark(self.buffer.get_insert()))
        elif location.ends_word():
            iter_start = location.copy()
            # Take up to three preceding words as prediction context.
            iter_start.backward_visible_word_starts(3)
            text = buffer.get_text(iter_start, location, False)
            words = text.split()
            if words:
                self.suggest(words[-1], words[0:-1])

    def disable(self):
        """Temporarily disable auto-correction."""
        self.disabled = True

    def enable(self):
        """Re-enable auto-correction."""
        self.disabled = False

    def set_language(self, language):
        """Switch spell-checking and prediction to *language* (e.g. 'de_DE')."""
        # Expand bare two-letter codes to a full locale.
        # Bug fix: the original tested `if "en":`, which is always true.
        if len(language) == 2:
            if language == "en":
                language = "en_US"
        if self.language == language:
            return
        self.language = language
        config_file = get_media_path("pressagio_config.ini")
        pres_config = configparser.ConfigParser()
        pres_config.read(config_file)
        pres_config.set("Database", "database",
                        get_media_path("corpora/" + self.language + ".sqlite"))
        self.context_tracker = pressagio.context_tracker.ContextTracker(
            pres_config, self.predictor_registry, self.callback)
        self.prsgio = self.predictor_registry[0]
        self.enchant_dict = enchant.Dict(self.language)

    def __init__(self, textview, textbuffer):
        self.TextView = textview
        self.buffer = textbuffer
        self.suggestion = ""
        self.bubble = self.bubble_label = None
        self.buffer.connect_after('insert-text', self.text_insert)
        self.TextView.connect('key-press-event', self.key_pressed)
        self.language = "en_US"
        self.frequency_dict = {}
        self.get_frequency_dict(self.language)
        self.enchant_dict = enchant.Dict(self.language)
        # Prediction via pressagio is experimental and off by default.
        self.use_pressagio = False
        config_file = get_media_path("pressagio_config.ini")
        pres_config = configparser.ConfigParser()
        pres_config.read(config_file)
        pres_config.set("Database", "database",
                        get_media_path("corpora/" + self.language + ".sqlite"))
        self.callback = PressagioCallback("")
        self.predictor_registry = pressagio.predictor.PredictorRegistry(
            pres_config)
        self.context_tracker = pressagio.context_tracker.ContextTracker(
            pres_config, self.predictor_registry, self.callback)
        self.prsgio = self.predictor_registry[0]

View File

@ -0,0 +1 @@
0.1.3

View File

@ -0,0 +1,34 @@
# -*- coding: utf-8 -*-
#
# Poio Tools for Linguists
#
# Copyright (C) 2009-2013 Poio Project
# Author: Peter Bouda <pbouda@cidles.eu>
# URL: <http://media.cidles.eu/poio/>
# For license information, see LICENSE
from . import predictor
from . import context_tracker
class Pressagio:
    """Facade wiring a callback, the predictors and a context tracker."""

    def __init__(self, callback, config, dbconnection=None):
        self.config = config
        self.callback = callback
        # Bug fix: the original referenced the unimported package name
        # `pressagio.*`; the relative imports above are what is in scope.
        self.predictor_registry = predictor.PredictorRegistry(
            self.config, dbconnection)
        self.context_tracker = context_tracker.ContextTracker(
            self.config, self.predictor_registry, callback)
        self.predictor_activator = predictor.PredictorActivator(
            self.config, self.predictor_registry, self.context_tracker)
        self.predictor_activator.combination_policy = "meritocracy"

    def predict(self):
        """Return the predicted words, best suggestion first."""
        multiplier = 1
        predictions = self.predictor_activator.predict(multiplier)
        return [p.word for p in predictions]

    def close_database(self):
        """Close the underlying n-gram database."""
        self.predictor_registry.close_database()

View File

@ -0,0 +1,37 @@
# -*- coding: utf-8 -*-
#
# Poio Tools for Linguists
#
# Copyright (C) 2009-2013 Poio Project
# Author: Peter Bouda <pbouda@cidles.eu>
# URL: <http://media.cidles.eu/poio/>
# For license information, see LICENSE
"""
Base class for callbacks.
"""
from __future__ import absolute_import, unicode_literals
class Callback(object):
    """
    Base class for callbacks.

    Keeps a growing stream of entered characters; subclasses override
    past_stream()/future_stream() to supply context to the predictor.
    """

    def __init__(self):
        self.stream = ""   # characters entered so far
        self.empty = ""    # constant empty "future" context

    def past_stream(self):
        """Return the text before the cursor."""
        return self.stream

    def future_stream(self):
        """Return the text after the cursor (none by default)."""
        return self.empty

    def update(self, character):
        """Append *character*; a backspace character removes the last one.

        Bug fix: the original tested the unqualified name `stream`
        (NameError) and discarded the result of the slice instead of
        assigning it back.
        """
        if character == "\b" and len(self.stream) > 0:
            self.stream = self.stream[:-1]
        else:
            self.stream += character

View File

@ -0,0 +1,34 @@
# -*- coding: utf-8 -*-
#
# Poio Tools for Linguists
#
# Copyright (C) 2009-2013 Poio Project
# Author: Peter Bouda <pbouda@cidles.eu>
# URL: <http://media.cidles.eu/poio/>
# For license information, see LICENSE
from __future__ import absolute_import, unicode_literals
import unicodedata
blankspaces = " \f\n\r\t\v…"
separators = "`~!@#$%^&*()_-+=\\|]}[{'\";:/?.>,<†„“।॥ו´י0123456789"
def first_word_character(string):
    """Return the index of the first word character in *string*, or -1."""
    return next(
        (pos for pos, ch in enumerate(string) if is_word_character(ch)), -1)
def last_word_character(string):
    """Return the index of the last word character in *string*, or -1."""
    mirrored = first_word_character(string[::-1])
    return -1 if mirrored == -1 else len(string) - mirrored - 1
def is_word_character(char):
    """True if *char* belongs to a Unicode letter category ('L*')."""
    return unicodedata.category(char).startswith("L")

View File

@ -0,0 +1,64 @@
# -*- coding: utf-8 -*-
#
# Poio Tools for Linguists
#
# Copyright (C) 2009-2013 Poio Project
# Author: Peter Bouda <pbouda@cidles.eu>
# URL: <http://media.cidles.eu/poio/>
# For license information, see LICENSE
"""
Combiner classes to merge results from several predictors.
"""
from __future__ import absolute_import, unicode_literals
import abc
from . import predictor
class Combiner(object):
    """
    Base class for all combiners.

    A combiner merges the predictions of several predictors into a single
    prediction, collapsing duplicate words.
    """
    __metaclass__ = abc.ABCMeta

    def __init__(self):
        pass

    def filter(self, prediction):
        """Collapse duplicate suggestions, summing their probabilities.

        The summed probability is clamped to predictor.MAX_PROBABILITY.
        """
        seen_tokens = set()
        result = predictor.Prediction()
        for i, suggestion in enumerate(prediction):
            token = suggestion.word
            if token not in seen_tokens:
                # Fold the probability of any later duplicate into the
                # first occurrence.  TODO: interpolate here?
                for j in range(i + 1, len(prediction)):
                    if token == prediction[j].word:
                        suggestion.probability += prediction[j].probability
                        if suggestion.probability > predictor.MAX_PROBABILITY:
                            # Bug fix: was the unqualified (undefined)
                            # name MAX_PROBABILITY.
                            suggestion.probability = predictor.MAX_PROBABILITY
                seen_tokens.add(token)
                result.add_suggestion(suggestion)
        return result

    @abc.abstractmethod
    def combine(self):
        raise NotImplementedError("Method must be implemented")
class MeritocracyCombiner(Combiner):
    """Combiner that merges all predictions on an equal footing."""

    def __init__(self):
        pass

    def combine(self, predictions):
        """Merge *predictions* into one filtered Prediction."""
        merged = predictor.Prediction()
        for single_prediction in predictions:
            for entry in single_prediction:
                merged.add_suggestion(entry)
        return self.filter(merged)

View File

@ -0,0 +1,177 @@
# -*- coding: utf-8 -*-
#
# Poio Tools for Linguists
#
# Copyright (C) 2009-2013 Poio Project
# Author: Peter Bouda <pbouda@cidles.eu>
# URL: <http://media.cidles.eu/poio/>
# For license information, see LICENSE
"""
Class for context tracker.
"""
from __future__ import absolute_import, unicode_literals
import copy
import io
from . import character
from . import observer
from . import tokenizer
# Number of characters of past context kept for change detection.
DEFAULT_SLIDING_WINDOW_SIZE = 80

class InvalidCallbackException(Exception): pass

class ContextChangeDetector(object):
    """Detects whether the text context changed since the last update.

    Keeps a sliding window over the tail of the past stream and compares
    every new stream against it.
    """

    def __init__(self, lowercase):
        self.lowercase = lowercase
        self.sliding_windows_size = DEFAULT_SLIDING_WINDOW_SIZE
        self.sliding_window = ""

    def update_sliding_window(self, string):
        """Remember at most the last sliding_windows_size characters."""
        if len(string) <= self.sliding_windows_size:
            self.sliding_window = string
        else:
            # Bug fix: keep the *last* N characters.  The original slice
            # `string[:-N]` kept everything except the tail, so the window
            # could never match the most recent context (context_change()
            # below searches for the window as a suffix of the new stream).
            self.sliding_window = string[-self.sliding_windows_size:]

    def context_change(self, past_stream):
        """Return True if *past_stream* no longer extends the old window."""
        # rename for clarity
        prev_context = self.sliding_window
        curr_context = past_stream
        if len(prev_context) == 0:
            if len(curr_context) == 0:
                return False
            else:
                return True
        ctx_idx = curr_context.rfind(prev_context)
        if ctx_idx == -1:
            return True
        # Text appended after the previously seen window.
        remainder = curr_context[ctx_idx + len(prev_context):]
        idx = character.last_word_character(remainder)
        if idx == -1:
            if len(remainder) == 0:
                return False
            last_char = curr_context[ctx_idx + len(prev_context) - 1]
            if character.is_word_character(last_char):
                return False
            else:
                return True
        if idx == len(remainder) - 1:
            return False
        return True

    def change(self, past_stream):
        """Return the text appended since the last window update."""
        # rename for clarity
        prev_context = self.sliding_window
        curr_context = past_stream
        if len(prev_context) == 0:
            return past_stream
        ctx_idx = curr_context.rfind(prev_context)
        if ctx_idx == -1:
            return past_stream
        result = curr_context[ctx_idx + len(prev_context):]
        if (self.context_change(past_stream)):
            # The context changed: prepend the last token of the previous
            # window so it is re-learned together with the new text.
            sliding_window_stream = self.sliding_window
            r_tok = tokenizer.ReverseTokenizer(sliding_window_stream)
            r_tok.lowercase = self.lowercase
            first_token = r_tok.next_token()
            if not len(first_token) == 0:
                result = first_token + result
        return result
class ContextTracker(object): #observer.Observer
    """
    Tracks the current context.

    Wraps a callback that supplies the past/future text streams and offers
    tokenized access to them for the predictors.
    """

    def __init__(self, config, predictor_registry, callback):
        #self.dispatcher = observer.Dispatcher(self)
        self.config = config
        self.lowercase = self.config.getboolean("ContextTracker",
                                                "lowercase_mode")
        self.registry = predictor_registry
        if callback:
            self.callback = callback
        else:
            raise InvalidCallbackException
        self.context_change_detector = ContextChangeDetector(self.lowercase)
        # Let the registry reach back into this tracker.
        self.registry.context_tracker = self
        self.sliding_windows_size = DEFAULT_SLIDING_WINDOW_SIZE

    def context_change(self):
        """True if the past stream changed since the last update."""
        return self.context_change_detector.context_change(self.past_stream())

    def update_context(self):
        """Learn newly completed tokens and advance the sliding window."""
        change = self.context_change_detector.change(self.past_stream())
        tok = tokenizer.ForwardTokenizer(change)
        tok.lowercase = self.lowercase
        change_tokens = []
        while(tok.has_more_tokens()):
            token = tok.next_token()
            change_tokens.append(token)
        if len(change_tokens) != 0:
            # remove prefix (partially entered token or empty token)
            change_tokens.pop()
        # Bug fix: the attribute set in __init__ is `self.registry`;
        # `self.predictor_registry` does not exist on this class.
        for predictor in self.registry:
            predictor.learn(change_tokens)
        self.context_change_detector.update_sliding_window(self.past_stream())

    def prefix(self):
        """Return the partial token currently being entered."""
        # Bug fix: the result was computed but never returned, so
        # is_completion_valid() crashed on None.
        return self.token(0)

    def token(self, index):
        """Return the *index*-th token counting back from the cursor."""
        past_string_stream = self.past_stream()
        string_io = io.StringIO(past_string_stream)
        tok = tokenizer.ReverseTokenizer(string_io)
        tok.lowercase = self.lowercase
        i = 0
        while tok.has_more_tokens() and i <= index:
            token = tok.next_token()
            i += 1
        if i <= index:
            # Fewer tokens than requested.
            token = ""
        return token

    def extra_token_to_learn(self, index, change):
        return self.token(index + len(change))

    def future_stream(self):
        return self.callback.future_stream()

    def past_stream(self):
        return self.callback.past_stream()

    def is_completion_valid(self, completion):
        """True if the current prefix occurs in *completion*."""
        prefix = self.prefix().lower()
        if prefix in completion:
            return True
        return False

    def __repr__(self):
        # Bug fix: the stream accessors are methods and must be called;
        # the original concatenated bound methods with str (TypeError).
        return self.callback.past_stream() + "<|>" \
            + self.callback.future_stream() + "\n"

    # def update(self, observable):
    #     self.dispatcher.dispatch(observable)

View File

@ -0,0 +1,745 @@
# -*- coding: utf-8 -*-
#
# Poio Tools for Linguists
#
# Copyright (C) 2001-2013 Poio Project
# Author: Peter Bouda <pbouda@cidles.eu>
# URL: <http://www.cidles.eu/ltll/poio>
# For license information, see LICENSE
"""
Classes to connect to databases.
"""
from __future__ import absolute_import, unicode_literals
import abc
import sqlite3
import time
import re
import regex
try:
import psycopg2
psycopg2.extensions.register_type(psycopg2.extensions.UNICODE)
psycopg2.extensions.register_type(psycopg2.extensions.UNICODEARRAY)
except ImportError:
pass
# Matches single quotes, for SQL string escaping.
re_escape_singlequote = re.compile("'")

def _sqlite3_regex(expr, item):
    """sqlite REGEXP implementation: True if *expr* matches *item*."""
    return bool(regex.search(expr, item))
class DatabaseConnector(object):
    """
    Base class for all database connectors.

    NOTE(review): queries are assembled with str.format / manual quote
    escaping; this is only safe while the n-gram values come from trusted
    corpora.
    """
    __metaclass__ = abc.ABCMeta

    def __init__(self, dbname, cardinality = 1):
        """
        Constructor of the base class DababaseConnector.

        Parameters
        ----------
        dbname : str
            path to the database file or database name
        cardinality : int
            default cardinality for n-grams
        """
        # Bug fix: removed a leftover gibberish debug print.
        self.cardinality = cardinality
        self.dbname = dbname
        self.lowercase = False
        self.normalize = False

    def create_ngram_table(self, cardinality):
        """
        Creates a table for n-gram of a give cardinality. The table name is
        constructed from this parameter, for example for cardinality `2` there
        will be a table `_2_gram` created.

        Parameters
        ----------
        cardinality : int
            The cardinality to create a table for.
        """
        query = "CREATE TABLE IF NOT EXISTS _{0}_gram (".format(cardinality)
        unique = ""
        for i in reversed(range(cardinality)):
            if i != 0:
                unique += "word_{0}, ".format(i)
                query += "word_{0} TEXT, ".format(i)
            else:
                unique += "word"
                query += "word TEXT, count INTEGER, UNIQUE({0}) );".format(
                    unique)
        self.execute_sql(query)

    def delete_ngram_table(self, cardinality):
        """
        Deletes the table for n-gram of a give cardinality. The table name is
        constructed from this parameter, for example for cardinality `2` there
        will be a table `_2_gram` deleted.

        Parameters
        ----------
        cardinality : int
            The cardinality of the table to delete.
        """
        query = "DROP TABLE IF EXISTS _{0}_gram;".format(cardinality)
        self.execute_sql(query)

    def create_index(self, cardinality):
        """
        Create an index for the table with the given cardinality.

        Parameters
        ----------
        cardinality : int
            The cardinality to create a index for.
        """
        # One index per context column (word_1 .. word_{n-1}).
        for i in reversed(range(cardinality)):
            if i != 0:
                query = "CREATE INDEX idx_{0}_gram_{1} ON _{0}_gram(word_{1});".format(cardinality, i)
                self.execute_sql(query)

    def delete_index(self, cardinality):
        """
        Delete index for the table with the given cardinality.

        Parameters
        ----------
        cardinality : int
            The cardinality of the index to delete.
        """
        for i in reversed(range(cardinality)):
            if i != 0:
                query = "DROP INDEX IF EXISTS idx_{0}_gram_{1};".format(
                    cardinality, i)
                self.execute_sql(query)

    def create_unigram_table(self):
        """
        Creates a table for n-grams of cardinality 1.
        """
        self.create_ngram_table(1)

    def create_bigram_table(self):
        """
        Creates a table for n-grams of cardinality 2.
        """
        self.create_ngram_table(2)

    def create_trigram_table(self):
        """
        Creates a table for n-grams of cardinality 3.
        """
        self.create_ngram_table(3)

    def ngrams(self, with_counts=False):
        """
        Returns all ngrams that are in the table.

        Parameters
        ----------
        with_counts : bool
            Also yield the count column for each n-gram.

        Returns
        -------
        ngrams : generator
            A generator for ngram tuples.
        """
        query = "SELECT "
        for i in reversed(range(self.cardinality)):
            if i != 0:
                query += "word_{0}, ".format(i)
            else:
                query += "word"
        if with_counts:
            query += ", count"
        query += " FROM _{0}_gram;".format(self.cardinality)
        result = self.execute_sql(query)
        for row in result:
            yield tuple(row)

    def unigram_counts_sum(self):
        """Return the total count over all unigrams."""
        query = "SELECT SUM(count) from _1_gram;"
        result = self.execute_sql(query)
        return self._extract_first_integer(result)

    def ngram_count(self, ngram):
        """
        Gets the count for a given ngram from the database.

        Parameters
        ----------
        ngram : iterable of str
            A list, set or tuple of strings.

        Returns
        -------
        count : int
            The count of the ngram.
        """
        query = "SELECT count FROM _{0}_gram".format(len(ngram))
        query += self._build_where_clause(ngram)
        query += ";"
        result = self.execute_sql(query)
        return self._extract_first_integer(result)

    def ngram_like_table(self, ngram, limit = -1):
        """Return n-grams whose last word starts like ngram[-1], by count."""
        query = "SELECT {0} FROM _{1}_gram {2} ORDER BY count DESC".format(
            self._build_select_like_clause(len(ngram)), len(ngram),
            self._build_where_like_clause(ngram))
        if limit < 0:
            query += ";"
        else:
            query += " LIMIT {0};".format(limit)
        return self.execute_sql(query)

    def ngram_like_table_filtered(self, ngram, filter, limit = -1):
        pass

    def increment_ngram_count(self, ngram):
        pass

    def insert_ngram(self, ngram, count):
        """
        Inserts a given n-gram with count into the database.

        Parameters
        ----------
        ngram : iterable of str
            A list, set or tuple of strings.
        count : int
            The count for the given n-gram.
        """
        query = "INSERT INTO _{0}_gram {1};".format(len(ngram),
            self._build_values_clause(ngram, count))
        self.execute_sql(query)

    def update_ngram(self, ngram, count):
        """
        Updates a given ngram in the database. The ngram has to be in the
        database, otherwise this method will stop with an error.

        Parameters
        ----------
        ngram : iterable of str
            A list, set or tuple of strings.
        count : int
            The count for the given n-gram.
        """
        query = "UPDATE _{0}_gram SET count = {1}".format(len(ngram), count)
        query += self._build_where_clause(ngram)
        query += ";"
        self.execute_sql(query)

    def remove_ngram(self, ngram):
        """
        Removes a given ngram from the databae. The ngram has to be in the
        database, otherwise this method will stop with an error.

        Parameters
        ----------
        ngram : iterable of str
            A list, set or tuple of strings.
        """
        query = "DELETE FROM _{0}_gram".format(len(ngram))
        query += self._build_where_clause(ngram)
        query += ";"
        self.execute_sql(query)

    def open_database(self):
        raise NotImplementedError("Method must be implemented")

    def close_database(self):
        raise NotImplementedError("Method must be implemented")

    def execute_sql(self):
        raise NotImplementedError("Method must be implemented")

    ############################################### Private methods

    def _build_values_clause(self, ngram, count):
        # Escape embedded single quotes before building the VALUES list.
        ngram_escaped = []
        for n in ngram:
            ngram_escaped.append(re_escape_singlequote.sub("''", n))
        values_clause = "VALUES('"
        values_clause += "', '".join(ngram_escaped)
        values_clause += "', {0})".format(count)
        return values_clause

    def _build_where_clause(self, ngram):
        where_clause = " WHERE"
        for i in range(len(ngram)):
            n = re_escape_singlequote.sub("''", ngram[i])
            if i < (len(ngram) - 1):
                where_clause += " word_{0} = '{1}' AND".format(
                    len(ngram) - i - 1, n)
            else:
                # Bug fix: removed an unused fuzzy-match pattern variable
                # and a debug print.
                where_clause += " word = '{0}'".format(n)
        return where_clause

    def _build_select_like_clause(self, cardinality):
        result = ""
        for i in reversed(range(cardinality)):
            if i != 0:
                result += "word_{0}, ". format(i)
            else:
                result += "word, count"
        return result

    def _build_where_like_clause(self, ngram):
        where_clause = " WHERE"
        for i in range(len(ngram)):
            if i < (len(ngram) - 1):
                where_clause += " word_{0} = '{1}' AND".format(
                    len(ngram) - i - 1, ngram[i])
            else:
                # The last word is matched with a (fuzzy) regexp prefix.
                pattern = '(?:%s){e<=%d}' % (ngram[-1], 0)
                where_clause += " (word regexp '%s')" % pattern
        return where_clause

    def _extract_first_integer(self, table):
        # First cell of the first row, defaulting to 0 on empty/NULL.
        count = 0
        if len(table) > 0:
            if len(table[0]) > 0:
                count = int(table[0][0])
        if not count > 0:
            count = 0
        return count
class SqliteDatabaseConnector(DatabaseConnector):
    """
    Database connector for sqlite databases.
    """

    def __init__(self, dbname, cardinality = 1):
        """
        Constructor for the sqlite database connector.

        Parameters
        ----------
        dbname : str
            path to the database file
        cardinality : int
            default cardinality for n-grams
        """
        DatabaseConnector.__init__(self, dbname, cardinality)
        self.con = None
        self.open_database()

    def commit(self):
        """Send a commit to the database."""
        self.con.commit()

    def open_database(self):
        """Open the sqlite database and register the REGEXP function."""
        self.con = sqlite3.connect(self.dbname)
        self.con.create_function("regexp", 2, _sqlite3_regex)

    def close_database(self):
        """Close the sqlite database if it is open."""
        if self.con:
            self.con.close()

    def execute_sql(self, query):
        """Execute *query* on the open database and return all rows."""
        cursor = self.con.cursor()
        cursor.execute(query)
        return cursor.fetchall()
class PostgresDatabaseConnector(DatabaseConnector):
    """
    Database connector for postgres databases.

    NOTE(review): queries are assembled with str.format; this is only safe
    while database/word values come from trusted configuration and corpora.
    """

    def __init__(self, dbname, cardinality = 1, host = "localhost", port = 5432,
            user = "postgres", password = None, connection = None):
        """
        Constructor for the postgres database connector.

        Parameters
        ----------
        dbname : str
            the database name
        cardinality : int
            default cardinality for n-grams
        host : str
            hostname of the postgres database
        port : int
            port number of the postgres database
        user : str
            user name for the postgres database
        password: str
            user password for the postgres database
        connection : connection
            an open database connection
        """
        DatabaseConnector.__init__(self, dbname, cardinality)
        self.con = connection
        self.host = host
        self.port = port
        self.user = user
        self.password = password

    def create_database(self):
        """
        Creates an empty database if not exists.
        """
        if not self._database_exists():
            # Connect to the maintenance database "postgres" with
            # autocommit, since CREATE DATABASE cannot run in a transaction.
            con = psycopg2.connect(host=self.host, database="postgres",
                user=self.user, password=self.password, port=self.port)
            con.set_isolation_level(
                psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
            query = "CREATE DATABASE {0};".format(self.dbname)
            c = con.cursor()
            c.execute(query)
            con.close()
        if self.normalize:
            # Install a NORMALIZE() SQL function (plperlu + Text::Unidecode)
            # used by the normalized indexes and queries below.
            self.open_database()
            query = "CREATE EXTENSION IF NOT EXISTS \"plperlu\";"
            self.execute_sql(query)
            # query = """CREATE OR REPLACE FUNCTION normalize(str text)
            #RETURNS text
            #AS $$
            #import unicodedata
            #return ''.join(c for c in unicodedata.normalize('NFKD', str)
            #if unicodedata.category(c) != 'Mn')
            #$$ LANGUAGE plpython3u IMMUTABLE;"""
            # query = """CREATE OR REPLACE FUNCTION normalize(mystr text)
            # RETURNS text
            # AS $$
            # from unidecode import unidecode
            # return unidecode(mystr.decode("utf-8"))
            # $$ LANGUAGE plpythonu IMMUTABLE;"""
            query = """CREATE OR REPLACE FUNCTION normalize(text)
RETURNS text
AS $$
use Text::Unidecode;
return unidecode(shift);
$$ LANGUAGE plperlu IMMUTABLE;"""
            self.execute_sql(query)
            self.commit()
            self.close_database()

    def reset_database(self):
        """
        Re-create an empty database.
        """
        if self._database_exists():
            # DROP DATABASE also needs autocommit on the maintenance DB.
            con = psycopg2.connect(host=self.host, database="postgres",
                user=self.user, password=self.password, port=self.port)
            con.set_isolation_level(
                psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
            query = "DROP DATABASE {0};".format(self.dbname)
            c = con.cursor()
            c.execute(query)
            con.close()
        self.create_database()

    def create_index(self, cardinality):
        """
        Create an index for the table with the given cardinality.

        Parameters
        ----------
        cardinality : int
            The cardinality to create a index for.
        """
        DatabaseConnector.create_index(self, cardinality)
        # varchar_pattern_ops enables index use for LIKE 'prefix%' queries.
        query = "CREATE INDEX idx_{0}_gram_varchar ON _{0}_gram(word varchar_pattern_ops);".format(cardinality)
        self.execute_sql(query)
        if self.lowercase:
            # Case-insensitive lookups go through LOWER(...), so index that.
            for i in reversed(range(cardinality)):
                if i != 0:
                    query = "CREATE INDEX idx_{0}_gram_{1}_lower ON _{0}_gram(LOWER(word_{1}));".format(cardinality, i)
                    self.execute_sql(query)
            if self.normalize:
                query = "CREATE INDEX idx_{0}_gram_lower_normalized_varchar ON _{0}_gram(NORMALIZE(LOWER(word)) varchar_pattern_ops);".format(cardinality)
                self.execute_sql(query)
            else:
                query = "CREATE INDEX idx_{0}_gram_lower_varchar ON _{0}_gram(LOWER(word) varchar_pattern_ops);".format(cardinality)
                self.execute_sql(query)
        elif self.normalize:
            query = "CREATE INDEX idx_{0}_gram_normalized_varchar ON _{0}_gram(NORMALIZE(word) varchar_pattern_ops);".format(cardinality)
            self.execute_sql(query)

    def delete_index(self, cardinality):
        """
        Delete index for the table with the given cardinality.

        Parameters
        ----------
        cardinality : int
            The cardinality of the index to delete.
        """
        DatabaseConnector.delete_index(self, cardinality)
        # Drop all index variants unconditionally; IF EXISTS makes the
        # ones that were never created a no-op.
        query = "DROP INDEX IF EXISTS idx_{0}_gram_varchar;".format(cardinality)
        self.execute_sql(query)
        query = "DROP INDEX IF EXISTS idx_{0}_gram_normalized_varchar;".format(
            cardinality)
        self.execute_sql(query)
        query = "DROP INDEX IF EXISTS idx_{0}_gram_lower_varchar;".format(
            cardinality)
        self.execute_sql(query)
        query = "DROP INDEX IF EXISTS idx_{0}_gram_lower_normalized_varchar;".\
            format(cardinality)
        self.execute_sql(query)
        for i in reversed(range(cardinality)):
            if i != 0:
                query = "DROP INDEX IF EXISTS idx_{0}_gram_{1}_lower;".format(
                    cardinality, i)
                self.execute_sql(query)

    def commit(self):
        """
        Sends a commit to the database.
        """
        self.con.commit()

    def open_database(self):
        """
        Opens the postgres database (no-op if a connection is already open).
        """
        if not self.con:
            try:
                self.con = psycopg2.connect(host=self.host,
                    database=self.dbname, user=self.user,
                    password=self.password, port=self.port)
            except psycopg2.Error as e:
                print("Error while opening database:")
                print(e.pgerror)

    def close_database(self):
        """
        Closes the postgres database.
        """
        if self.con:
            self.con.close()
            self.con = None

    def execute_sql(self, query):
        """
        Executes a given query string on an open postgres database.
        """
        c = self.con.cursor()
        c.execute(query)
        result = []
        if c.rowcount > 0:
            try:
                # Statements without a result set raise ProgrammingError
                # on fetch; treat them as returning no rows.
                result = c.fetchall()
            except psycopg2.ProgrammingError:
                pass
        return result

    ############################################### Private methods

    def _database_exists(self):
        """
        Check if the database exists.
        """
        con = psycopg2.connect(host=self.host, database="postgres",
            user=self.user, password=self.password, port=self.port)
        query_check = "select datname from pg_catalog.pg_database"
        query_check += " where datname = '{0}';".format(self.dbname)
        c = con.cursor()
        c.execute(query_check)
        result = c.fetchall()
        if len(result) > 0:
            return True
        return False

    def _build_where_like_clause(self, ngram):
        # Context words are matched exactly (optionally lowercased); the
        # final word is matched as a LIKE prefix, optionally lowercased
        # and/or run through the NORMALIZE() helper.
        where_clause = " WHERE"
        for i in range(len(ngram)):
            if i < (len(ngram) - 1):
                if self.lowercase:
                    where_clause += " LOWER(word_{0}) = LOWER('{1}') AND".format(
                        len(ngram) - i - 1, ngram[i])
                else:
                    where_clause += " word_{0} = '{1}' AND".format(
                        len(ngram) - i - 1, ngram[i])
            else:
                if ngram[-1] != "":
                    if self.lowercase:
                        if self.normalize:
                            where_clause += " NORMALIZE(LOWER(word)) LIKE NORMALIZE(LOWER('{0}%'))".format(ngram[-1])
                        else:
                            where_clause += " LOWER(word) LIKE LOWER('{0}%')".format(ngram[-1])
                    elif self.normalize:
                        where_clause += " NORMALIZE(word) LIKE NORMALIZE('{0}%')".format(ngram[-1])
                    else:
                        where_clause += " word LIKE '{0}%'".format(ngram[-1])
                else:
                    # remove the " AND"
                    where_clause = where_clause[:-4]
        return where_clause
#################################################### Functions
def insert_ngram_map_sqlite(ngram_map, ngram_size, outfile, append=False,
        create_index=False):
    """Write the ngram->count mapping *ngram_map* into a sqlite database."""
    connector = SqliteDatabaseConnector(outfile, ngram_size)
    connector.create_ngram_table(ngram_size)
    for ngram, count in ngram_map.items():
        if append:
            # Merge with counts already present in the database.
            previous = connector.ngram_count(ngram)
            if previous > 0:
                connector.update_ngram(ngram, previous + count)
            else:
                connector.insert_ngram(ngram, count)
        else:
            connector.insert_ngram(ngram, count)
    connector.commit()
    if create_index and not append:
        connector.create_index(ngram_size)
    connector.close_database()
def insert_ngram_map_postgres(ngram_map, ngram_size, dbname, append=False,
        create_index=False, host = "localhost", port = 5432, user = "postgres",
        password = None, lowercase = False, normalize = False):
    """Write the ngram->count mapping *ngram_map* into a postgres database."""
    connector = PostgresDatabaseConnector(dbname, ngram_size, host, port, user,
        password)
    connector.lowercase = lowercase
    connector.normalize = normalize
    connector.create_database()
    connector.open_database()
    if not append:
        # Start from a clean table when not merging.
        connector.delete_index(ngram_size)
        connector.delete_ngram_table(ngram_size)
        connector.create_ngram_table(ngram_size)
    for ngram, count in ngram_map.items():
        if append:
            # Merge with counts already present in the database.
            previous = connector.ngram_count(ngram)
            if previous > 0:
                connector.update_ngram(ngram, previous + count)
            else:
                connector.insert_ngram(ngram, count)
        else:
            connector.insert_ngram(ngram, count)
    connector.commit()
    if create_index and not append:
        connector.create_index(ngram_size)
        connector.commit()
    connector.close_database()
def _filter_ngrams(sql, dictionary):
for ngram in sql.ngrams():
delete_ngram = False
for word in ngram:
if not word in dictionary:
delete_ngram = True
if delete_ngram:
sql.remove_ngram(ngram)
def filter_ngrams_sqlite(dictionary, ngram_size, outfile):
    """Filter an existing sqlite n-gram database against *dictionary*."""
    connector = SqliteDatabaseConnector(outfile, ngram_size)
    _filter_ngrams(connector, dictionary)
    connector.commit()
    connector.close_database()
def filter_ngrams_postgres(dictionary, ngram_size, dbname, host = "localhost",
        port = 5432, user = "postgres", password = None):
    """Filter an existing postgres n-gram database against *dictionary*."""
    connector = PostgresDatabaseConnector(dbname, ngram_size, host, port, user,
        password)
    connector.open_database()
    _filter_ngrams(connector, dictionary)
    connector.commit()
    connector.close_database()

View File

@ -0,0 +1,71 @@
# -*- coding: utf-8 -*-
#
# Poio Tools for Linguists
#
# Copyright (C) 2009-2013 Poio Project
# Author: Peter Bouda <pbouda@cidles.eu>
# URL: <http://media.cidles.eu/poio/>
# For license information, see LICENSE
from __future__ import absolute_import, unicode_literals
import abc
class Observer(object):
    """
    Base class for classes that want to observe other classes, e.g. the
    PredictorActivator.
    """
    __metaclass__ = abc.ABCMeta

    @abc.abstractmethod
    def update(self, observable):
        """React to a change in *observable*; must be overridden."""
        raise NotImplementedError("Method must be implemented")
class Oberservable(object):
    """
    Base class for everything that needs observation, e.g. the predictors.

    NOTE: the misspelled class name is preserved for compatibility with
    existing callers.
    """

    def __init__(self):
        self._observers = []

    def attach(self, observer):
        """Register *observer* unless it is already registered."""
        if observer not in self._observers:
            self._observers.append(observer)

    def detach(self, observer):
        """Unregister *observer*; ignored if it was never attached."""
        try:
            self._observers.remove(observer)
        except ValueError:
            pass

    def notify(self, modifier=None):
        """Call update(self) on every observer except *modifier*."""
        for watcher in self._observers:
            if watcher != modifier:
                watcher.update(self)
class Dispatcher(object):
    """
    Dispatches observable notifications to registered handler functions.
    """

    def __init__(self, obj):
        self.observables = []
        self.dispatch_dict = {}
        # The object attached to every observable we start watching.
        self.obj = obj

    def map(self, observable, func):
        """Observe *observable* and route its notifications to *func*."""
        # Bug fix: `attach(obj)` referenced an undefined local name; the
        # object held by this dispatcher is `self.obj`.
        observable.attach(self.obj)
        self.observables.append(observable)
        self.dispatch_dict[observable] = func
        self.dispatch(observable)

    def dispatch(self, observable):
        """Invoke the handler registered for *observable*."""
        handler_func = self.dispatch_dict[observable]
        handler_func(observable)

View File

@ -0,0 +1,425 @@
# -*- coding: utf-8 -*-
#
# Poio Tools for Linguists
#
# Copyright (C) 2009-2013 Poio Project
# Author: Peter Bouda <pbouda@cidles.eu>
# URL: <http://media.cidles.eu/poio/>
# For license information, see LICENSE
"""
Classes for predictors and to handle suggestions and predictions.
"""
from __future__ import absolute_import, unicode_literals
import os
try:
import configparser
except ImportError:
import ConfigParser as configparser
from . import dbconnector
#import pressagio.observer
# Valid bounds for suggestion probabilities.
MIN_PROBABILITY = 0.0
MAX_PROBABILITY = 1.0

# Raised when a suggestion probability falls outside the valid range.
class SuggestionException(Exception): pass
# Raised when an unknown combination policy is requested.
class UnknownCombinerException(Exception): pass
# Raised when the predictor registry cannot be set up.
class PredictorRegistryException(Exception): pass
class Suggestion(object):
    """
    Class for a simple suggestion, consists of a string and a probility for
    that string.

    Suggestions order primarily by probability, then alphabetically by word.
    """

    def __init__(self, word, probability):
        # Bug fix: removed a leftover debug print.
        self.word = word
        # Set the backing field directly; the property setter validates
        # only assignments made after construction (as before).
        self._probability = probability

    def __eq__(self, other):
        return (self.word == other.word
                and self.probability == other.probability)

    def __lt__(self, other):
        if self.probability < other.probability:
            return True
        if self.probability == other.probability:
            return self.word < other.word
        return False

    def __repr__(self):
        return "Word: {0} - Probability: {1}".format(
            self.word, self.probability)

    @property
    def probability(self):
        """The probability property."""
        return self._probability

    @probability.setter
    def probability(self, value):
        # Reject probabilities outside [MIN_PROBABILITY, MAX_PROBABILITY].
        if value < MIN_PROBABILITY or value > MAX_PROBABILITY:
            raise SuggestionException("Probability is too high or too low.")
        self._probability = value

    @probability.deleter
    def probability(self):
        del self._probability
class Prediction(list):
    """
    An ordered list of suggestions, best suggestion first.
    """

    def __init__(self):
        pass

    def __eq__(self, other):
        if self is other:
            return True
        if len(self) != len(other):
            return False
        return all(mine == theirs for mine, theirs in zip(self, other))

    def suggestion_for_token(self, token):
        """Return the suggestion whose word is *token*, or None."""
        for entry in self:
            if entry.word == token:
                return entry

    def add_suggestion(self, suggestion):
        """Insert *suggestion*, keeping the list sorted best-first."""
        position = 0
        while position < len(self) and suggestion < self[position]:
            position += 1
        self.insert(position, suggestion)
class PredictorActivator(object):
"""
PredictorActivator starts the execution of the active predictors,
monitors their execution and collects the predictions returned, or
terminates a predictor's execution if it execedes its maximum
prediction time.
The predictions returned by the individual predictors are combined
into a single prediction by the active Combiner.
"""
def __init__(self, config, registry, context_tracker):
self.config = config
self.registry = registry
self.context_tracker = context_tracker
#self.dispatcher = pressagio.observer.Dispatcher(self)
self.predictions = []