Subversion Repositories LCARS

Rev

Rev 296 | Rev 298 | Go to most recent revision | Only display areas with differences | Ignore whitespace | Details | Blame | Last modification | View Log | RSS feed

Rev 296 Rev 297
1
"""
1
"""
2
Created on 2014-10-20
2
Created on 2014-10-20
3

3

4
@author: Thomas 'PointedEars' Lahn <mail@PointedEars.de>
4
@author: Thomas 'PointedEars' Lahn <mail@PointedEars.de>
5

5

6
"""
6
"""
7
7
8
from os import chdir, stat
8
from os import chdir, stat
9
from sys import stderr
9
from sys import stderr
10
from os.path import dirname, realpath, basename
10
from os.path import dirname, realpath, basename
11
from pickle import dump, load
11
from pickle import dump, load
12
from re import match, DOTALL, search, sub, split, compile
12
from re import match, DOTALL, search, sub, split, compile
13
13
# Messages whose ``min_level`` is greater than this value are suppressed.
debug_level = 2

def dmsg(*args, **kwargs):
    """
    Print a debug message if the module debug level is high enough.

    Accepts the same arguments as the built-in print(), plus:

    :param min_level: Minimum value of ``debug_level`` at which the
        message is printed.  Missing or ``None`` means 1.
    :param file: Stream to write to.  Defaults to ``sys.stderr``
        when not passed.
    """
    # kwargs is a dict: options must be looked up as keys.  hasattr()
    # inspects attributes of the dict object and was therefore always
    # False, which discarded the caller's 'min_level' and 'file'.
    min_level = kwargs.pop('min_level', None)
    if min_level is None:
        min_level = 1

    kwargs.setdefault('file', stderr)

    if debug_level >= min_level:
        print(*args, **kwargs)
26
26
def sort_dict_alnum_english_key(phrase):
    """
    Return a case-insensitive sort key for a dictionary item.

    :param phrase: Sequence whose first element is the phrase string
        (presumably a ``(phrase, data)`` pair from ``dict.items()``
        -- verify against callers).
    :return: The phrase in lower case, with a ``{...}`` wrapper
        replaced by its content so that braces do not affect ordering.
    """
    return sub(r'\{(.+)\}', r'\1', phrase[0]).lower()
29
29
class Dictionary(dict):
    """
    A Dictionary (not to be confused with its ancestor, dict)
    represents a word dictionary stored in a file.

    Keys are phrases; each value is a dict mapping a field name
    (one of the alternatives in ``_keys``) to its text.
    """
    # Field name that introduces a new entry in the dictionary file.
    _language_key = 'en'
    # Regular-expression alternation of the field names recognized by load().
    _keys = "ipa|en|lit|pos|com|tag|ex"
    # Default (empty) mapping of regular expression -> entry data.
    # clean() replaces it with a per-instance copy before adding to it,
    # so that instances do not share expressions.
    _expressions = {}

    def load (self, dictionary_file, language_key='en'):
        """
        Loads a word dictionary from a file.

        The parsed result is cached as ``<basename>.pickle`` in the
        directory of this module; the cache is used instead of the
        source file unless the source file is newer.

        NOTE: This changes the current working directory to the
        directory of this module.

        :param dictionary_file: Path of the dictionary source file.
        :type dictionary_file: str
        :param language_key: Field name that starts a new entry.
        :type language_key: str
        """
        self._language_key = language_key

        dmsg('Loading dictionary ', end='', min_level=1)

        chdir(dirname(realpath(__file__)))

        pickle_file = basename(dictionary_file) + '.pickle'

        try:
            pickle_mtime = stat(pickle_file).st_mtime
        except FileNotFoundError:
            pickle_mtime = None

        if pickle_mtime is None or stat(dictionary_file).st_mtime > pickle_mtime:
            dmsg('from {0} ...'.format(dictionary_file), end='', min_level=1)
            # NOTE(review): uses the platform default encoding; consider
            # an explicit encoding if the file contains non-ASCII text.
            with open(dictionary_file) as f:
                self._parse_lines(f)

            dmsg('\nSaving pickle {0} ...'.format(pickle_file), end='', min_level=1)
            # TODO: Pickle should only contain strings to be small
            with open(pickle_file, mode='wb') as f:
                dump(self, f)
            dmsg(' done.', min_level=1)
        else:
            dmsg('from {0} ...'.format(pickle_file), end='', min_level=1)
            # NOTE(review): unpickling can execute arbitrary code; the
            # cache file must not be writable by untrusted parties.
            with open(pickle_file, mode='rb') as f:
                cached = load(f)
            for key, value in cached.items():
                self[key] = value

        dmsg(' done ({0} entries).'.format(len(self)), min_level=1)

    def _parse_lines (self, lines):
        """
        Parses dictionary source lines into entries of this dictionary.

        A line ``<language key>: <phrase>`` starts a new entry, a line
        ``<key>: <value>`` sets a field of the current entry, and a
        line indented by two more characters than the preceding field
        line continues that field's value (parts are joined by a space).

        :param lines: Iterable of text lines.
        :raises KeyError: if a field line precedes the first phrase line.
        """
        re_phrase = compile(
            r'^\s*{0}:\s*(?P<phrase>.+)'.format(self._language_key))
        re_field = compile(
            r'(?P<indent>\s*)(?P<key>{0}):\s*(?P<value>.+)'.format(self._keys))
        re_continuation = compile(r'(?P<indent>\s+)(?P<continuation>\S.*)')

        phrase = None
        key = None
        value = None
        indent = None

        for line in lines:
            m = re_phrase.match(line)
            if m is not None:
                # Join the pending multi-line value of the previous entry
                # now; otherwise it would stay a list there and be written
                # into the new entry when its first field is read.
                if isinstance(value, list):
                    self[phrase][key] = ' '.join(value)

                phrase = m.group("phrase")
                self[phrase] = {}
                key = value = indent = None
                continue

            m = re_field.match(line)
            if m is not None:
                # join previous value if necessary
                if isinstance(value, list):
                    self[phrase][key] = ' '.join(value)

                indent = m.group("indent")
                key = m.group("key")
                value = m.group("value")
                # assign a string for memory efficiency
                self[phrase][key] = value
                continue

            # continuations are only possible after a field line
            if indent is None:
                continue

            m = re_continuation.match(line)
            if m is not None and len(m.group("indent")) == len(indent) + 2:
                continuation = m.group("continuation")
                if isinstance(value, str):
                    # when a continuation is first found, convert to a list
                    # because there could be more continuations
                    value = self[phrase][key] = [value, continuation]
                else:
                    value.append(continuation)

        # join last value if necessary
        if isinstance(value, list):
            self[phrase][key] = ' '.join(value)

    def clean (self):
        """
        Cleans dictionary entries

        - A phrase containing ``;`` is replaced by one entry per
          synonym; all synonyms share the same data object.
        - A phrase of the form ``{phrase}``, optionally followed by
          ``(variant)``, is replaced by ``phrase``.  If the object has
          a callable ``clean_entry`` attribute, the phrase is passed
          through it first.
        - If the resulting phrase contains parentheses, it is converted
          to a regular expression and stored for translate_expression()
          instead of being stored as a key.
        """
        re_parens = compile(r'\(.+\)', DOTALL)
        re_parens_no_alt = compile(r'\(([^\|]+)\)', DOTALL)
        re_braces = compile(
            r'^\s*\{(?P<phrase>.+)\}(?:\s*\((?P<variant>.+?)\))?\s*$',
            DOTALL)
        re_semicolon = compile(r'\s*;\s*')

        # Give this instance its own expression mapping instead of
        # mutating the class-level default shared by all instances.
        if '_expressions' not in vars(self):
            self._expressions = dict(self._expressions)

        clean_entry = getattr(self, 'clean_entry', None)

        # iterate over a snapshot because entries are added and deleted
        for orig_phrase, data in list(self.items()):
            # TODO alternation ("(a|b)") in the original phrase
            # TODO optional parts ("(a)") in the original phrase

            if ';' in orig_phrase:
                for synonym in re_semicolon.split(orig_phrase):
                    self[re_braces.sub(r'\1', synonym)] = data

                del self[orig_phrase]
                continue

            m = re_braces.match(orig_phrase)
            if m is None:
                continue

            phrase = m.group('phrase')

            if callable(clean_entry):
                phrase = clean_entry(phrase)

            if re_parens.search(phrase) is not None:
                # alternation and optional parts
                expr = re_parens_no_alt.sub(r'(?:\1)?', phrase)
                expr = expr.replace('~', '(?=.)')
                self._expressions[expr] = data
            else:
                # remove braces
                self[phrase] = data

            del self[orig_phrase]

    def translate (self, phrase):
        """
        Translate a phrase according to this dictionary.
        For language-specific processing, this method should be
        called/overridden by inheriting classes.

        The phrase is looked up in lower case.  On success, the stored
        entry itself is returned after its language-key field has been
        set to the phrase as passed.

        :param phrase: Phrase to look up.
        :type phrase: str
        :return: The entry data, or None if there is no such entry.
        """
        translation = self.get(phrase.lower(), None)
        if translation is not None:
            translation[self._language_key] = phrase
            return translation

        return None

    def translate_expression (self, phrase):
        """
        Translate a phrase according to entries in this dictionary
        based on regular expressions.

        The expressions collected by clean() are matched against the
        beginning of the phrase.  On the first match, the stored entry
        itself is returned after its language-key field has been set
        to the matched text.

        :param phrase: Phrase to match.
        :type phrase: str
        :return: The entry data, or None if no expression matches.
        """
        # NOTE(review): this orders by the number of fields in the entry
        # data, descending; if "longest expression first" was intended,
        # the key should use item[0] -- confirm before changing.
        for expression, data in sorted(self._expressions.items(), key=lambda item:-len(item[1])):
            expression_match = match(expression, phrase)
            if expression_match is not None:
                data[self._language_key] = expression_match.group(0)
                return data

        return None
166
 
194