Subversion Repositories LCARS

Rev

Rev 297 | Go to most recent revision | Only display areas with differences | Ignore whitespace | Details | Blame | Last modification | View Log | RSS feed

Rev 297 Rev 298
1
"""
1
"""
2
Created on 2014-10-20
2
Created on 2014-10-20
3

3

4
@author: Thomas 'PointedEars' Lahn <mail@PointedEars.de>
4
@author: Thomas 'PointedEars' Lahn <mail@PointedEars.de>
5

5

6
"""
6
"""
7
7
8
from os import chdir, stat
8
from os import chdir, stat
9
from sys import stderr
9
from sys import stderr
10
from os.path import dirname, realpath, basename
10
from os.path import dirname, realpath, basename
11
from pickle import dump, load
11
from pickle import dump, load
12
from re import match, DOTALL, search, sub, split, compile
12
from re import match, DOTALL, search, sub, split, compile
13
13
14
# Global verbosity threshold: dmsg() prints a message only when this value
# is greater than or equal to the min_level the message is checked against.
debug_level = 2
14
debug_level = 2
15
15
16
def dmsg(*args, **kwargs):
    """
    Print a debug message if the module-level ``debug_level`` is high enough.

    All positional and keyword arguments are forwarded to :func:`print`,
    except for the extra keyword described below.

    :param min_level: minimum ``debug_level`` required for the message to
        be printed; defaults to 1 when omitted or ``None``.
    :param file: output stream; defaults to ``sys.stderr`` when omitted.
    """
    # kwargs is a dict, so its entries must be looked up as keys:
    # hasattr() tests for attributes and would never find them.
    min_level = kwargs.pop('min_level', None)
    if min_level is None:
        min_level = 1

    # Honour a caller-supplied stream, fall back to stderr otherwise.
    kwargs.setdefault('file', stderr)

    if debug_level >= min_level:
        print(*args, **kwargs)
26
26
27
def sort_dict_alnum_english_key(phrase):
    """
    Build a case-insensitive comparison key from the first element
    of ``phrase``.

    One pair of curly braces around a (greedily matched) run of
    characters is removed from that element, and the result is
    lower-cased.

    :param phrase: a sequence whose first element is a string
        (presumably a ``(phrase, data)`` dictionary item -- verify
        against callers).
    :return: the normalised string.
    """
    headword = phrase[0]
    without_braces = sub(r'\{(.+)\}', r'\1', headword)
    return without_braces.lower()
29
29
30
class Dictionary(dict):
    """
    A Dictionary (not to be confused with its ancestor, dict)
    represents a word dictionary stored in a file.

    Keys are phrases; each value is a dict mapping the field names
    listed in ``_keys`` to the text read for that field.  Phrases that
    contain parenthesised parts are moved by :meth:`clean` into a
    separate mapping of regular expressions, ``_expressions``.
    """
    _language_key = 'en'
    _keys = "ipa|en|lit|pos|com|tag|ex"
    # Class-level fallback only (kept for subclasses that override
    # __init__ without calling it); every instance gets its own
    # mapping in __init__ so that clean() does not leak expressions
    # from one dictionary into another.
    _expressions = {}

    def __init__(self, *args, **kwargs):
        """
        Create a dictionary; arguments are passed on to :class:`dict`.
        """
        super().__init__(*args, **kwargs)
        self._expressions = {}

    def load(self, dictionary_file, language_key='en'):
        """
        Loads a word dictionary from a file.

        The text file is parsed when no pickle named
        ``basename(dictionary_file) + '.pickle'`` exists or when the
        text file is newer than that pickle; the parsed result is then
        written to the pickle.  Otherwise the entries are read from
        the pickle.

        As a side effect the current working directory is changed to
        the directory containing this module, so a relative
        ``dictionary_file`` and the pickle are resolved against it.

        :param dictionary_file: path of the dictionary text file
        :type dictionary_file: str
        :param language_key: field name that introduces a new phrase
        :type language_key: str
        """
        self._language_key = language_key

        dmsg('Loading dictionary ', end='', min_level=1)

        chdir(dirname(realpath(__file__)))

        pickle_file = basename(dictionary_file) + '.pickle'

        try:
            pickle_mtime = stat(pickle_file).st_mtime
        except FileNotFoundError:
            pickle_mtime = None

        if pickle_mtime is None or stat(dictionary_file).st_mtime > pickle_mtime:
            dmsg('from {0} ...'.format(dictionary_file), end='', min_level=1)
            self._parse_file(dictionary_file)

            dmsg('\nSaving pickle {0} ...'.format(pickle_file), end='', min_level=1)
            # TODO: Pickle should only contain strings to be small
            with open(pickle_file, mode='wb') as f:
                dump(self, f)
            dmsg(' done.', min_level=1)
        else:
            dmsg('from {0} ...'.format(pickle_file), end='', min_level=1)
            # NOTE(security): unpickling executes code embedded in the
            # file; the pickle must only ever come from a trusted
            # location (it is written by this method itself).
            with open(pickle_file, mode='rb') as f:
                cached = load(f)
            for phrase, data in cached.items():
                self[phrase] = data

        dmsg(' done ({0} entries).'.format(len(self)), min_level=1)

    def _join_parts(self, phrase, key, parts):
        """
        Store a value collected from several lines as one
        space-separated string; single-line values are already stored.
        """
        if parts is not None and len(parts) > 1:
            self[phrase][key] = ' '.join(parts)

    def _parse_file(self, dictionary_file):
        """
        Parse the dictionary text file into entries of this mapping.

        A line ``<language_key>: <phrase>`` starts a new entry, a line
        ``<key>: <value>`` (key being one of ``_keys``) adds a field
        to the current entry, and a line indented exactly two
        characters deeper than the preceding field line continues that
        field's value.  All other lines are ignored.
        """
        re_phrase = compile(
            r'^\s*{0}:\s*(?P<phrase>.+)'.format(self._language_key))
        re_field = compile(
            r'(?P<indent>\s*)(?P<key>{0}):\s*(?P<value>.+)'.format(self._keys))
        re_continuation = compile(r'(?P<indent>\s+)(?P<continuation>\S.*)')

        phrase = None
        key = None
        parts = None    # lines collected for the current field's value
        indent = None   # indentation of the current field line

        with open(dictionary_file) as f:
            for line in f:
                m = re_phrase.match(line)
                if m is not None:
                    # finish the previous entry's pending value first,
                    # so that it cannot end up in the new entry
                    self._join_parts(phrase, key, parts)
                    phrase = m.group('phrase')
                    self[phrase] = {}
                    key = parts = indent = None
                    continue

                m = re_field.match(line)
                if m is not None:
                    if phrase is None:
                        # a field before the first phrase belongs to
                        # no entry
                        continue

                    # join previous value if necessary
                    self._join_parts(phrase, key, parts)
                    indent = m.group('indent')
                    key = m.group('key')
                    parts = [m.group('value')]
                    # assign a string for memory efficiency
                    self[phrase][key] = parts[0]
                    continue

                if indent is None:
                    continue

                m = re_continuation.match(line)
                if m is not None and len(m.group('indent')) == len(indent) + 2:
                    parts.append(m.group('continuation'))

        # join last value if necessary
        self._join_parts(phrase, key, parts)

    def clean(self):
        """
        Cleans dictionary entries

        * A key containing ``;`` is replaced by one entry per
          semicolon-separated synonym (enclosing braces removed); all
          synonyms share the same data object.
        * A key of the form ``{phrase}``, optionally followed by
          ``(variant)``, is replaced by ``phrase``.  If the instance
          has a callable ``clean_entry`` attribute, the phrase is
          passed through it first.  If the phrase contains
          parentheses, it is stored in ``_expressions`` as a regular
          expression instead: parenthesised parts without ``|``
          become optional and ``~`` becomes ``(?=.)``.
        * All other keys are left untouched.
        """
        re_parens = compile(r'\(.+\)', DOTALL)
        re_parens_no_alt = compile(r'\(([^\|]+)\)', DOTALL)
        re_braces = compile(
            r'^\s*\{(?P<phrase>.+)\}(?:\s*\((?P<variant>.+?)\))?\s*$',
            DOTALL)
        re_semicolon = compile(r'\s*;\s*')

        # TODO alternation and optional parts in phrases that are
        # not enclosed in braces

        # iterate over a snapshot because entries are added and removed
        for orig_phrase, data in list(self.items()):
            if ';' in orig_phrase:
                for synonym in re_semicolon.split(orig_phrase):
                    self[re_braces.sub(r'\1', synonym)] = data

                del self[orig_phrase]
                continue

            m = re_braces.match(orig_phrase)
            if m is None:
                continue

            phrase = m.group('phrase')

            if callable(getattr(self, 'clean_entry', None)):
                phrase = self.clean_entry(phrase)

            if re_parens.search(phrase) is not None:
                # alternation and optional parts
                expr = re_parens_no_alt.sub(r'(?:\1)?', phrase)
                expr = expr.replace('~', '(?=.)')
                self._expressions[expr] = data
            else:
                # remove braces
                self[phrase] = data

            del self[orig_phrase]

    def translate(self, phrase):
        """
        Translate a phrase according to this dictionary.
        For language-specific processing, this method should be
        called/overridden by inheriting classes.

        The lookup uses the lower-cased phrase.  On success the stored
        entry itself is returned (not a copy) after its language-key
        field has been set to the phrase as it was passed in.

        :param phrase: the phrase to look up
        :type phrase: str
        :return: the entry's data, or ``None`` if there is no entry
        """
        translation = self.get(phrase.lower())
        if translation is None:
            return None

        translation[self._language_key] = phrase
        return translation

    def translate_expression(self, phrase):
        """
        Translate a phrase according entries in this dictionary
        based on regular expressions.

        The expressions collected by :meth:`clean` are tried in turn;
        the first one matching the phrase up to its end wins.  The
        stored data object itself is returned after its language-key
        field has been set to the matched text.

        :param phrase: the phrase to look up
        :type phrase: str
        :return: the matching entry's data, or ``None``
        """
        # NOTE(review): the sort key is the number of fields of the
        # entry (item[1] is the data dict); possibly the length of
        # the expression (item[0]) was intended -- confirm.
        candidates = sorted(
            self._expressions.items(), key=lambda item: -len(item[1]))

        for expression, data in candidates:
            # group the expression so that the end anchor applies to
            # every branch of a top-level alternation
            expression_match = match(r'(?:{0})$'.format(expression), phrase)
            if expression_match is not None:
                data[self._language_key] = expression_match.group(0)
                return data

        return None
194
 
194