Subversion Repositories LCARS

Rev

Rev 298 | Details | Compare with Previous | Last modification | View Log | RSS feed

Rev Author Line No. Line
293 PointedEar 1
"""
2
Created on 2014-10-20
3
 
4
@author: Thomas 'PointedEars' Lahn <mail@PointedEars.de>
5
 
6
"""
7
 
8
from os import chdir, stat
9
from sys import stderr
10
from os.path import dirname, realpath, basename
11
from pickle import dump, load
12
from re import match, DOTALL, search, sub, split, compile
13
 
14
# Global verbosity threshold: dmsg() prints a message only when
# debug_level is greater than or equal to the message's min_level.
debug_level = 2
15
 
300 PointedEar 16
def dmsg (*args, **kwargs):
    """
    Print a debug message if the current debug level permits it.

    Takes the same positional and keyword arguments as print(), plus
    the keyword argument described below.

    :param min_level: lowest value of the module-level debug_level at
        which the message is printed (default: 1); it is removed from
        the keyword arguments before they are handed to print()
    :type min_level: int
    :param file: stream to write to; stderr is used when this argument
        is missing or falsy
    """
    required_level = kwargs.pop('min_level', 1)

    if not kwargs.get('file'):
        kwargs['file'] = stderr

    # message is more verbose than the current debug level allows
    if debug_level < required_level:
        return

    print(*args, **kwargs)
24
 
300 PointedEar 25
def sort_dict_alnum_english_key (phrase):
    """
    Return the sort key for a dictionary item: its phrase in lower case,
    with the braces around every brace-delimited part removed.

    :param phrase: indexable object whose element 0 is the phrase
        string (presumably a (phrase, data) pair from dict.items())
    :return: the phrase without braces, lower-cased
    :rtype: str
    """
    # Non-greedy, so that each brace group is unwrapped on its own;
    # a greedy match would leave the inner braces of "{a}; {b}" in place.
    return sub(r'\{(.+?)\}', r'\1', phrase[0]).lower()
27
 
300 PointedEar 28
class Dictionary (dict):
    """
    A Dictionary (not to be confused with its ancestor, dict)
    represents a word dictionary stored in a file.

    The items map a phrase to a dict of that phrase's fields
    (field key -> string value).
    """
    # Key of the field whose line starts a new entry in the dictionary
    # file; translate() and translate_expression() also store the
    # looked-up phrase under this key.
    _language_key = 'en'

    # Regular-expression alternation of the field keys recognized
    # when parsing a dictionary file.
    _keys = "ipa|en|lit|pos|com|tag|ex"

    # Fallback for instances whose __init__ was not run; every
    # regularly constructed instance gets its own dict in __init__.
    _expressions = {}

    def __init__ (self, *args, **kwargs):
        """
        Creates a new Dictionary; the arguments are passed to dict().
        """
        super().__init__(*args, **kwargs)

        # Per-instance mapping (regular expression -> entry data) filled
        # by clean(); a dict shared through the class would leak the
        # expressions of one dictionary into all the others.
        self._expressions = {}

    def load (self, dictionary_file, keys=None, language_key=None):
        """
        Loads a word dictionary from a file.

        The working directory of the process is changed to the
        directory of this module.  The parsed dictionary is cached in
        a pickle file named after the base name of the dictionary file
        plus ".pickle"; that cache is read instead of the dictionary
        file unless it is missing or older than the dictionary file.

        :param dictionary_file: path of the dictionary file
        :type dictionary_file: str
        :param keys: regular-expression alternation of the field keys
            to recognize; replaces the current one if not None
        :type keys: str
        :param language_key: key of the field that starts an entry;
            replaces the current one if not None
        :type language_key: str
        """
        if keys is not None:
            self._keys = keys

        if language_key is not None:
            self._language_key = language_key

        dmsg('Loading dictionary ', end='', min_level=1)

        chdir(dirname(realpath(__file__)))

        pickle_file = basename(dictionary_file) + '.pickle'

        try:
            pickle_mtime = stat(pickle_file).st_mtime
        except FileNotFoundError:
            pickle_mtime = None

        if pickle_mtime is None or stat(dictionary_file).st_mtime > pickle_mtime:
            dmsg('from {0} ...'.format(dictionary_file), end='', min_level=1)

            self._read_entries(dictionary_file)

            dmsg('\nSaving pickle {0} ...'.format(pickle_file), end='', min_level=1)

            # TODO: Pickle should only contain strings to be small
            with open(pickle_file, mode='wb') as f:
                dump(self, f)

            dmsg(' done.', min_level=1)
        else:
            dmsg('from {0} ...'.format(pickle_file), end='', min_level=1)

            # NOTE: unpickling can execute arbitrary code, so the pickle
            # file must not be writable by untrusted parties.
            with open(pickle_file, mode='rb') as f:
                pickle = load(f)

            for key, value in pickle.items():
                self[key] = value

        dmsg(' done ({0} entries).'.format(len(self)), min_level=1)

    def _read_entries (self, dictionary_file):
        """
        Parses a dictionary file and stores its entries in this object.

        A line with the language key starts a new entry, a line with
        one of the other field keys adds a field to the current entry,
        and a line indented two characters deeper than the preceding
        field line continues the value of that field.  The parts of a
        continued value are joined with single spaces.

        :param dictionary_file: path of the dictionary file
        :type dictionary_file: str
        """
        # compiled once instead of once per line
        re_phrase = compile(
            r'^\s*{0}:\s*(?P<phrase>.+)'.format(self._language_key))
        re_field = compile(
            r'(?P<indent>\s*)(?P<key>{0}):\s*(?P<value>.+)'.format(self._keys))
        re_continuation = compile(r'(?P<indent>\s+)(?P<continuation>\S.*)')

        phrase = None
        key = None
        value = None
        indent = None

        # NOTE(review): the file is read with the platform's default
        # encoding -- confirm that this matches the dictionary files.
        with open(dictionary_file) as f:
            for line in f:
                m = re_phrase.match(line)
                if m is not None:
                    # Join the pending multi-line value of the previous
                    # entry now; otherwise it would stay a list there and
                    # later be stored under the new phrase instead.
                    if isinstance(value, list):
                        self[phrase][key] = ' '.join(value)

                    phrase = m.group("phrase")
                    self[phrase] = {}
                    indent = key = value = None
                    continue

                m = re_field.match(line)
                if m is not None:
                    # join previous value if necessary
                    if isinstance(value, list):
                        self[phrase][key] = ' '.join(value)

                    indent = m.group("indent")
                    key = m.group("key")
                    value = m.group("value")
                    # assign a string for memory efficiency
                    self[phrase][key] = value
                    continue

                # continuations are only possible after a field line
                if indent is None:
                    continue

                m = re_continuation.match(line)
                if m is not None and len(m.group("indent")) == len(indent) + 2:
                    continuation = m.group("continuation")
                    if isinstance(value, str):
                        # when a continuation is first found, convert to
                        # a list because there could be more continuations
                        value = self[phrase][key] = [value, continuation]
                    else:
                        value.append(continuation)

        # join last value if necessary
        if isinstance(value, list):
            self[phrase][key] = ' '.join(value)

    def clean (self):
        """
        Cleans dictionary entries.

        A phrase that contains semicolons is replaced by one item per
        semicolon-separated synonym (without its braces), all of which
        share the data of the original item.  Any other phrase that is
        wrapped in braces is unwrapped and, if the object has a
        callable clean_entry attribute, passed through it.  If the
        result contains a parenthesized part, it is turned into a
        regular expression and stored for translate_expression()
        instead of as an item; otherwise it is stored as an item.
        In either case the original item is removed.  Phrases
        matching neither form are left untouched.
        """
        re_parens = compile(r'\(.+\)', DOTALL)
        re_parens_no_alt = compile(r'\(([^\|]+)\)', DOTALL)
        re_braces = compile(
            r'^\s*\{(?P<phrase>.+)\}(?:\s*\((?P<variant>.+?)\))?\s*$',
            DOTALL)
        re_semicolon = compile(r'\s*;\s*')

        # iterate over a snapshot because items are added and removed
        for orig_phrase, data in list(self.items()):
            # TODO: alternation ("(a|b)") and optional parts ("(a)")
            # in the original phrase are not handled yet

            if ';' in orig_phrase:
                for synonym in re_semicolon.split(orig_phrase):
                    self[re_braces.sub(r'\1', synonym)] = data

                del self[orig_phrase]
                continue

            m = re_braces.match(orig_phrase)
            if m is None:
                continue

            phrase = m.group('phrase')

            if callable(getattr(self, 'clean_entry', None)):
                phrase = self.clean_entry(phrase)

            if re_parens.search(phrase) is not None:
                # alternation and optional parts: a parenthesized part
                # without "|" becomes an optional group, "~" becomes a
                # lookahead that requires one more character
                expr = re_parens_no_alt.sub(r'(?:\1)?', phrase)
                expr = expr.replace('~', '(?=.)')
                self._expressions[expr] = data
            else:
                # remove braces
                self[phrase] = data

            del self[orig_phrase]

    def translate (self, phrase):
        """
        Translate a phrase according to this dictionary.
        For language-specific processing, this method should be
        called/overridden by inheriting classes.

        The phrase is looked up in lower case.  The entry that is
        returned is the stored one, not a copy; the phrase as passed
        is written to it under the language key.

        :param phrase: the phrase to translate
        :type phrase: str
        :return: the data of the matching entry, or None if there is
            no entry for the phrase
        :rtype: dict
        """
        translation = self.get(phrase.lower(), None)
        if translation is not None:
            translation[self._language_key] = phrase
            return translation

        return None

    def translate_expression (self, phrase):
        """
        Translate a phrase according to entries in this dictionary
        based on regular expressions.

        The expressions collected by clean() are tried in turn; each
        must match from the start to the end of the phrase.  The entry
        that is returned is the stored one, not a copy; the matched
        text is written to it under the language key.

        :param phrase: the phrase to translate
        :type phrase: str
        :return: the data of the first matching entry, or None if no
            expression matches
        :rtype: dict
        """
        # NOTE(review): the entries are ordered by the number of fields
        # in their data (descending), not by the length of the
        # expression -- confirm that this is the intended precedence.
        candidates = sorted(
            self._expressions.items(), key=lambda item: -len(item[1]))

        for expression, data in candidates:
            expression_match = match(r'{0}$'.format(expression), phrase)
            if expression_match is not None:
                data[self._language_key] = expression_match.group(0)
                return data

        return None