Subversion Repositories LCARS

Rev

Rev 298 | Only display areas with differences | Ignore whitespace | Details | Blame | Last modification | View Log | RSS feed

Rev 298 Rev 300
1
"""
1
"""
2
Created on 2014-10-20
2
Created on 2014-10-20
3

3

4
@author: Thomas 'PointedEars' Lahn <mail@PointedEars.de>
4
@author: Thomas 'PointedEars' Lahn <mail@PointedEars.de>
5

5

6
"""
6
"""
7
7
8
from os import chdir, stat
8
from os import chdir, stat
9
from sys import stderr
9
from sys import stderr
10
from os.path import dirname, realpath, basename
10
from os.path import dirname, realpath, basename
11
from pickle import dump, load
11
from pickle import dump, load
12
from re import match, DOTALL, search, sub, split, compile
12
from re import match, DOTALL, search, sub, split, compile
13
13
14
debug_level = 2
14
debug_level = 2
15
15
16
def dmsg (*args, **kwargs):
    """
    Print a debug message if the module-level ``debug_level`` permits it.

    Positional and keyword arguments are passed through to ``print()``,
    except for the extra keyword argument described below.

    :param min_level: Minimum ``debug_level`` at which the message
        is printed.  Missing or ``None`` means 1.
    :type min_level: int
    :param file: Stream to write to.  Missing or ``None`` means
        the ``stderr`` imported by this module.
    """
    # min_level is ours, not print()'s, so it must be removed from kwargs
    min_level = kwargs.pop('min_level', None)
    if min_level is None:
        min_level = 1

    if debug_level < min_level:
        return

    if kwargs.get('file') is None:
        kwargs['file'] = stderr

    print(*args, **kwargs)
26
24
27
def sort_dict_alnum_english_key (phrase):
    """
    Return the sort key for a dictionary item.

    The first element of ``phrase`` is taken, a ``{...}`` span in it
    is replaced by its content (greedy: from the first ``{`` to the
    last ``}`` on a line), and the result is lower-cased.

    :param phrase: Sequence whose first element is the phrase string,
        e.g. a ``(key, value)`` pair from ``dict.items()``.
    :type phrase: tuple
    :return: The lower-cased phrase with the brace pair removed.
    :rtype: str
    """
    return sub(r'\{(.+)\}', r'\1', phrase[0]).lower()
29
27
30
class Dictionary (dict):
    """
    A Dictionary (not to be confused with its ancestor, dict)
    represents a word dictionary stored in a file.

    Keys are phrases, values are dicts mapping a field key
    (one of the alternatives in ``_keys``) to its string value.
    """
    # Key that introduces a new phrase in the dictionary file
    _language_key = 'en'

    # Regular-expression alternation of the recognized field keys
    _keys = "ipa|en|lit|pos|com|tag|ex"

    # Empty fallback for instances on which clean() was never called.
    # It is never mutated; clean() creates a per-instance table.
    _expressions = {}

    def load (self, dictionary_file, keys=None, language_key=None):
        """
        Loads a word dictionary from a file.

        The parsed result is cached in a pickle file named after the
        base name of ``dictionary_file`` plus ``.pickle``.  The cache
        is used when it is not older than the dictionary file.

        NOTE: This changes the current working directory of the process
        to the directory of this module; relative paths are resolved
        against that directory.

        :param dictionary_file: Path of the dictionary file.
        :type dictionary_file: str
        :param keys: Regular-expression alternation of field keys;
            if ``None``, the current value is kept.
        :type keys: str
        :param language_key: Key that introduces a phrase;
            if ``None``, the current value is kept.
        :type language_key: str
        """
        if keys is not None:
            self._keys = keys

        if language_key is not None:
            self._language_key = language_key

        dmsg('Loading dictionary ', end='', min_level=1)

        chdir(dirname(realpath(__file__)))

        pickle_file = basename(dictionary_file) + '.pickle'

        try:
            pickle_mtime = stat(pickle_file).st_mtime
        except FileNotFoundError:
            pickle_mtime = None

        if pickle_mtime is None or stat(dictionary_file).st_mtime > pickle_mtime:
            dmsg('from {0} ...'.format(dictionary_file), end='', min_level=1)

            with open(dictionary_file) as f:
                self._parse_lines(f)

            dmsg('\nSaving pickle {0} ...'.format(pickle_file), end='', min_level=1)

            # TODO: Pickle should only contain strings to be small
            with open(pickle_file, mode='wb') as f:
                dump(self, f)

            dmsg(' done.', min_level=1)
        else:
            dmsg('from {0} ...'.format(pickle_file), end='', min_level=1)

            # NOTE(review): unpickling executes code from the file; this is
            # only safe while the cache file cannot be written by others.
            # NOTE(review): the cache is not invalidated when keys or
            # language_key differ from those used to create it.
            with open(pickle_file, mode='rb') as f:
                cached = load(f)

            for key, value in cached.items():
                self[key] = value

        dmsg(' done ({0} entries).'.format(len(self)), min_level=1)

    def _parse_lines (self, lines):
        """
        Parse dictionary source lines into this Dictionary.

        A line ``<language_key>: <phrase>`` starts a new entry.
        A line ``<key>: <value>`` sets a field of the current entry.
        A line indented exactly two characters deeper than the last
        field line continues that field; continuations are joined
        to the value with single spaces.

        :param lines: Iterable of text lines (e.g. an open file).
        """
        re_phrase = compile(
            r'^\s*{0}:\s*(?P<phrase>.+)'.format(self._language_key))
        re_field = compile(
            r'(?P<indent>\s*)(?P<key>{0}):\s*(?P<value>.+)'.format(self._keys))
        re_continuation = compile(r'(?P<indent>\s+)(?P<continuation>\S.*)')

        phrase = None
        key = None
        indent = None

        # fragments of a multi-line value not yet joined, or None;
        # the entry holds a plain string until then for memory efficiency
        pending = None

        for line in lines:
            m = re_phrase.match(line)
            if m is not None:
                # join the previous value before leaving its entry,
                # otherwise it would end up in the new one
                if pending is not None:
                    self[phrase][key] = ' '.join(pending)
                    pending = None

                phrase = m.group('phrase')
                self[phrase] = {}
                key = None
                indent = None
                continue

            if phrase is None:
                # no entry to attach anything to yet
                continue

            m = re_field.match(line)
            if m is not None:
                # join previous value if necessary
                if pending is not None:
                    self[phrase][key] = ' '.join(pending)
                    pending = None

                indent = m.group('indent')
                key = m.group('key')
                self[phrase][key] = m.group('value')
                continue

            if indent is None:
                continue

            m = re_continuation.match(line)
            if m is not None and len(m.group('indent')) == len(indent) + 2:
                # when a continuation is first found, start a list
                # because there could be more continuations
                if pending is None:
                    pending = [self[phrase][key]]

                pending.append(m.group('continuation'))

        # join last value if necessary
        if pending is not None:
            self[phrase][key] = ' '.join(pending)

    def clean (self):
        """
        Cleans dictionary entries

        * ``a; b`` is split into the synonyms ``a`` and ``b`` which
          share the same data; the original entry is removed.
        * ``{phrase}`` (optionally followed by ``(variant)``) is
          replaced by ``phrase``.  If the object has a callable
          ``clean_entry`` attribute, it is applied to the phrase.
        * A braced phrase that contains parentheses is not stored as
          a key but as a regular expression for
          ``translate_expression()``: ``(x)`` without ``|`` becomes
          the optional group ``(?:x)?``, and ``~`` becomes ``(?=.)``.

        Entries matching none of the above are left unchanged.
        """
        re_parens = compile(r'\(.+\)', DOTALL)

        # innermost parentheses only, so that several optional parts
        # in one phrase are converted separately
        re_parens_no_alt = compile(r'\(([^|()]+)\)', DOTALL)
        re_braces = compile(
            r'^\s*\{(?P<phrase>.+)\}(?:\s*\((?P<variant>.+?)\))?\s*$',
            DOTALL)
        re_semicolon = compile(r'\s*;\s*')

        # do not write into the table shared through the class attribute
        if '_expressions' not in self.__dict__:
            self._expressions = {}

        clean_entry = getattr(self, 'clean_entry', None)
        if not callable(clean_entry):
            clean_entry = None

        # iterate over a snapshot because entries are added and removed
        for orig_phrase, data in list(self.items()):
            # TODO alternation and optional parts in unbraced phrases

            if ';' in orig_phrase:
                del self[orig_phrase]

                for synonym in re_semicolon.split(orig_phrase):
                    synonym = re_braces.sub(r'\1', synonym)

                    # a leading or trailing ";" yields an empty synonym
                    if synonym:
                        self[synonym] = data

                continue

            m = re_braces.match(orig_phrase)
            if m is None:
                continue

            phrase = m.group('phrase')

            if clean_entry is not None:
                phrase = clean_entry(phrase)

            # remove first so that a cleaned phrase equal to the
            # original key is not deleted again
            del self[orig_phrase]

            if re_parens.search(phrase) is not None:
                # alternation and optional parts
                expr = re_parens_no_alt.sub(r'(?:\1)?', phrase)
                expr = expr.replace('~', '(?=.)')
                self._expressions[expr] = data
            else:
                # remove braces
                self[phrase] = data

    def translate (self, phrase):
        """
        Translate a phrase according to this dictionary.
        For language-specific processing, this method should be
        called/overridden by inheriting classes.

        The phrase is looked up in lower case.

        :param phrase: The phrase to translate.
        :type phrase: str
        :return: A copy of the entry's data with the language key
            set to ``phrase`` as passed, or ``None`` if not found.
        :rtype: dict
        """
        entry = self.get(phrase.lower(), None)
        if entry is None:
            return None

        # copy: the stored dict may be shared between synonyms
        translation = dict(entry)
        translation[self._language_key] = phrase
        return translation

    def translate_expression (self, phrase):
        """
        Translate a phrase according entries in this dictionary
        based on regular expressions.

        :param phrase: The phrase to translate.
        :type phrase: str
        :return: A copy of the data of the first matching expression
            with the language key set to the matched text, or ``None``
            if no expression matches.
        :rtype: dict
        """
        # NOTE(review): this orders by the number of fields of the data,
        # largest first; possibly the length of the expression
        # (item[0]) was intended -- confirm before changing.
        candidates = sorted(
            self._expressions.items(),
            key=lambda item: -len(item[1]))

        for expression, data in candidates:
            expression_match = match(r'{0}$'.format(expression), phrase)
            if expression_match is not None:
                # copy: do not modify the stored entry
                translation = dict(data)
                translation[self._language_key] = expression_match.group(0)
                return translation

        return None
194
 
200