Rev 298 | Only display areas with differences | Ignore whitespace | Details | Blame | Last modification | View Log | RSS feed
| Rev 298 | Rev 300 | ||
|---|---|---|---|
| 1 | """
|
1 | """
|
| 2 | Created on 2014-10-20
|
2 | Created on 2014-10-20
|
| 3 | 3 | ||
| 4 | @author: Thomas 'PointedEars' Lahn <mail@PointedEars.de>
|
4 | @author: Thomas 'PointedEars' Lahn <mail@PointedEars.de>
|
| 5 | 5 | ||
| 6 | """
|
6 | """
|
| 7 | 7 | ||
| 8 | from os import chdir, stat |
8 | from os import chdir, stat |
| 9 | from sys import stderr |
9 | from sys import stderr |
| 10 | from os.path import dirname, realpath, basename |
10 | from os.path import dirname, realpath, basename |
| 11 | from pickle import dump, load |
11 | from pickle import dump, load |
| 12 | from re import match, DOTALL, search, sub, split, compile |
12 | from re import match, DOTALL, search, sub, split, compile |
| 13 | 13 | ||
| 14 | debug_level = 2 |
14 | debug_level = 2 |
| 15 | 15 | ||
| 16 | def dmsg(*args, **kwargs): |
16 | def dmsg (*args, **kwargs): |
| 17 | if not hasattr(kwargs, 'min_level') or kwargs['min_level'] is None: |
- | |
| 18 | kwargs['min_level'] = 1 |
- | |
| 19 | - | ||
| 20 | if not hasattr(kwargs, 'file'): |
17 | if not kwargs.get('file'): |
| 21 | kwargs['file'] = stderr |
18 | kwargs['file'] = stderr |
| 22 | 19 | ||
| 23 | if debug_level >= kwargs['min_level']: |
20 | min_level = kwargs.pop('min_level', 1) |
| - | 21 | ||
| 24 | del kwargs['min_level'] |
22 | if debug_level >= min_level: |
| 25 | print(*args, **kwargs) |
23 | print(*args, **kwargs) |
| 26 | 24 | ||
| 27 | def sort_dict_alnum_english_key(phrase): |
25 | def sort_dict_alnum_english_key (phrase): |
| 28 | return sub(r'\{(.+)\}', r'\1', phrase[0]).lower() |
26 | return sub(r'\{(.+)\}', r'\1', phrase[0]).lower() |
| 29 | 27 | ||
| 30 | class Dictionary(dict): |
28 | class Dictionary (dict): |
| 31 | """
|
29 | """
|
| 32 | A Dictionary (not to be confused with its ancestor, dict)
|
30 | A Dictionary (not to be confused with its ancestor, dict)
|
| 33 | represents a word dictionary stored in a file.
|
31 | represents a word dictionary stored in a file.
|
| 34 |
|
32 |
|
| 35 | """
|
33 | """
|
| 36 | _language_key = 'en' |
34 | _language_key = 'en' |
| 37 | _keys = "ipa|en|lit|pos|com|tag|ex" |
35 | _keys = "ipa|en|lit|pos|com|tag|ex" |
| 38 | _expressions = {} |
36 | _expressions = {} |
| 39 | 37 | ||
| 40 | def load (self, dictionary_file, language_key='en'): |
38 | def load (self, dictionary_file, keys=None, language_key=None): |
| 41 | """
|
39 | """
|
| 42 | Loads a word dictionary from a file.
|
40 | Loads a word dictionary from a file.
|
| 43 | :param dictionary_file:
|
41 | :param dictionary_file:
|
| 44 | :type dictionary_file:
|
42 | :type dictionary_file:
|
| 45 | :param language_key:
|
43 | :param language_key:
|
| 46 | :type language_key:
|
44 | :type language_key:
|
| 47 | """
|
45 | """
|
| - | 46 | if keys is not None: |
|
| - | 47 | self._keys = keys |
|
| - | 48 | ||
| - | 49 | if language_key is not None: |
|
| 48 | self._language_key = language_key |
50 | self._language_key = language_key |
| 49 | 51 | ||
| 50 | dmsg('Loading dictionary '.format(dictionary_file), end='', min_level=1) |
52 | dmsg('Loading dictionary '.format(dictionary_file), end='', min_level=1) |
| 51 | 53 | ||
| 52 | chdir(dirname(realpath(__file__))) |
54 | chdir(dirname(realpath(__file__))) |
| 53 | 55 | ||
| 54 | pickle_file = basename(dictionary_file) + '.pickle' |
56 | pickle_file = basename(dictionary_file) + '.pickle' |
| 55 | 57 | ||
| 56 | try:
|
58 | try:
|
| 57 | pickle_mtime = stat(pickle_file).st_mtime |
59 | pickle_mtime = stat(pickle_file).st_mtime |
| 58 | except FileNotFoundError:
|
60 | except FileNotFoundError:
|
| 59 | pickle_mtime = None |
61 | pickle_mtime = None |
| 60 | 62 | ||
| 61 | if pickle_mtime is None or stat(dictionary_file).st_mtime > pickle_mtime: |
63 | if pickle_mtime is None or stat(dictionary_file).st_mtime > pickle_mtime: |
| 62 | dmsg('from {0} ...'.format(dictionary_file), end='', min_level=1) |
64 | dmsg('from {0} ...'.format(dictionary_file), end='', min_level=1) |
| - | 65 | ||
| 63 | phrase = None |
66 | phrase = None |
| 64 | key = None |
67 | key = None |
| 65 | value = None |
68 | value = None |
| 66 | with open(dictionary_file) as f: |
69 | with open(dictionary_file) as f: |
| 67 | indent = None |
70 | indent = None |
| 68 | 71 | ||
| 69 | for line in f: |
72 | for line in f: |
| 70 | m = match(r'^\s*{0}:\s*(?P<phrase>.+)'.format(self._language_key), line) |
73 | m = match(r'^\s*{0}:\s*(?P<phrase>.+)'.format(self._language_key), line) |
| 71 | if m is not None: |
74 | if m is not None: |
| 72 | phrase = m.group("phrase") |
75 | phrase = m.group("phrase") |
| 73 | self[phrase] = {} |
76 | self[phrase] = {} |
| 74 | indent = None |
77 | indent = None |
| 75 | else:
|
78 | else:
|
| 76 | m = match(r'(?P<indent>\s*)(?P<key>{0}):\s*(?P<value>.+)'.format(self._keys), line) |
79 | m = match(r'(?P<indent>\s*)(?P<key>{0}):\s*(?P<value>.+)'.format(self._keys), line) |
| 77 | if m is not None: |
80 | if m is not None: |
| 78 | # join previous value if necessary
|
81 | # join previous value if necessary
|
| 79 | if type(value) == list: |
82 | if type(value) == list: |
| 80 | self[phrase][key] = ' '.join(value) |
83 | self[phrase][key] = ' '.join(value) |
| 81 | 84 | ||
| 82 | indent = m.group("indent") |
85 | indent = m.group("indent") |
| 83 | key = m.group("key") |
86 | key = m.group("key") |
| 84 | value = m.group("value") |
87 | value = m.group("value") |
| 85 | # assign a string for memory efficiency
|
88 | # assign a string for memory efficiency
|
| 86 | self[phrase][key] = value |
89 | self[phrase][key] = value |
| 87 | elif indent is not None: |
90 | elif indent is not None: |
| 88 | m = match(r'(?P<indent>\s+)(?P<continuation>\S.*)', line) |
91 | m = match(r'(?P<indent>\s+)(?P<continuation>\S.*)', line) |
| 89 | if m is not None: |
92 | if m is not None: |
| 90 | if len(m.group("indent")) == len(indent) + 2: |
93 | if len(m.group("indent")) == len(indent) + 2: |
| 91 | continuation = m.group("continuation") |
94 | continuation = m.group("continuation") |
| 92 | if type(value) == str: |
95 | if type(value) == str: |
| 93 | # when a continuation is first found, convert to a list
|
96 | # when a continuation is first found, convert to a list
|
| 94 | # because there could be more continuations
|
97 | # because there could be more continuations
|
| 95 | value = self[phrase][key] = [value, continuation] |
98 | value = self[phrase][key] = [value, continuation] |
| 96 | else:
|
99 | else:
|
| 97 | value.append(continuation) |
100 | value.append(continuation) |
| 98 | 101 | ||
| 99 | # join last value if necessary
|
102 | # join last value if necessary
|
| 100 | if type(value) == list: |
103 | if type(value) == list: |
| 101 | self[phrase][key] = ' '.join(value) |
104 | self[phrase][key] = ' '.join(value) |
| 102 | 105 | ||
| 103 | dmsg('\nSaving pickle {0} ...'.format(pickle_file), end='', min_level=1) |
106 | dmsg('\nSaving pickle {0} ...'.format(pickle_file), end='', min_level=1) |
| - | 107 | ||
| 104 | # TODO: Pickle should only contain strings to be small
|
108 | # TODO: Pickle should only contain strings to be small
|
| 105 | with open(pickle_file, mode='wb') as f: dump(self, f) |
109 | with open(pickle_file, mode='wb') as f: dump(self, f) |
| - | 110 | ||
| 106 | dmsg(' done.', min_level=1) |
111 | dmsg(' done.', min_level=1) |
| 107 | else:
|
112 | else:
|
| 108 | dmsg('from {0} ...'.format(pickle_file), end='', min_level=1) |
113 | dmsg('from {0} ...'.format(pickle_file), end='', min_level=1) |
| - | 114 | ||
| 109 | with open(pickle_file, mode='rb') as f: pickle = load(f) |
115 | with open(pickle_file, mode='rb') as f: pickle = load(f) |
| 110 | for key, value in pickle.items(): |
116 | for key, value in pickle.items(): |
| 111 | self[key] = value |
117 | self[key] = value |
| 112 | 118 | ||
| 113 | dmsg(' done ({0} entries).'.format(len(self)), min_level=1) |
119 | dmsg(' done ({0} entries).'.format(len(self)), min_level=1) |
| 114 | 120 | ||
| 115 | def clean (self): |
121 | def clean (self): |
| 116 | """
|
122 | """
|
| 117 | Cleans dictionary entries
|
123 | Cleans dictionary entries
|
| 118 | """
|
124 | """
|
| 119 | re_parens = compile(r'\(.+\)', DOTALL) |
125 | re_parens = compile(r'\(.+\)', DOTALL) |
| 120 | re_parens_no_alt = compile(r'\(([^\|]+)\)', DOTALL) |
126 | re_parens_no_alt = compile(r'\(([^\|]+)\)', DOTALL) |
| 121 | re_braces = compile( |
127 | re_braces = compile( |
| 122 | r'^\s*\{(?P<phrase>.+)\}(?:\s*\((?P<variant>.+?)\))?\s*$', |
128 | r'^\s*\{(?P<phrase>.+)\}(?:\s*\((?P<variant>.+?)\))?\s*$', |
| 123 | DOTALL)
|
129 | DOTALL)
|
| 124 | re_semicolon = compile(r'\s*;\s*') |
130 | re_semicolon = compile(r'\s*;\s*') |
| 125 | 131 | ||
| 126 | for orig_phrase, data in list(self.items()): |
132 | for orig_phrase, data in list(self.items()): |
| 127 | # if there are optional or alternating parts
|
133 | # if there are optional or alternating parts
|
| 128 | if search(re_parens, orig_phrase): |
134 | if search(re_parens, orig_phrase): |
| 129 | if orig_phrase.find('|') > -1: |
135 | if orig_phrase.find('|') > -1: |
| 130 | # TODO alternation
|
136 | # TODO alternation
|
| 131 | pass
|
137 | pass
|
| 132 | else:
|
138 | else:
|
| 133 | # TODO optional parts
|
139 | # TODO optional parts
|
| 134 | pass
|
140 | pass
|
| 135 | 141 | ||
| 136 | if orig_phrase.find(';') > -1: |
142 | if orig_phrase.find(';') > -1: |
| 137 | synonyms = map( |
143 | synonyms = map( |
| 138 | lambda x: sub(re_braces, r'\1', x), |
144 | lambda x: sub(re_braces, r'\1', x), |
| 139 | split(re_semicolon, orig_phrase)) |
145 | split(re_semicolon, orig_phrase)) |
| 140 | 146 | ||
| 141 | for synonym in synonyms: |
147 | for synonym in synonyms: |
| 142 | self[synonym] = data |
148 | self[synonym] = data |
| 143 | 149 | ||
| 144 | del self[orig_phrase] |
150 | del self[orig_phrase] |
| 145 | else:
|
151 | else:
|
| 146 | m = match(re_braces, orig_phrase) |
152 | m = match(re_braces, orig_phrase) |
| 147 | if m is not None: |
153 | if m is not None: |
| 148 | phrase = m.group('phrase') |
154 | phrase = m.group('phrase') |
| 149 | 155 | ||
| 150 | if callable(getattr(self, 'clean_entry', None)): |
156 | if callable(getattr(self, 'clean_entry', None)): |
| 151 | phrase = self.clean_entry(phrase) |
157 | phrase = self.clean_entry(phrase) |
| 152 | 158 | ||
| 153 | m_parens = search(re_parens, phrase) |
159 | m_parens = search(re_parens, phrase) |
| 154 | if m_parens is not None: |
160 | if m_parens is not None: |
| 155 | # alternation and optional parts
|
161 | # alternation and optional parts
|
| 156 | expr = sub(re_parens_no_alt, r'(?:\1)?', phrase) |
162 | expr = sub(re_parens_no_alt, r'(?:\1)?', phrase) |
| 157 | expr = sub('~', '(?=.)', expr) |
163 | expr = sub('~', '(?=.)', expr) |
| 158 | self._expressions[expr] = data |
164 | self._expressions[expr] = data |
| 159 | else:
|
165 | else:
|
| 160 | # remove braces
|
166 | # remove braces
|
| 161 | self[phrase] = data |
167 | self[phrase] = data |
| 162 | 168 | ||
| 163 | del self[orig_phrase] |
169 | del self[orig_phrase] |
| 164 | 170 | ||
| 165 | def translate (self, phrase): |
171 | def translate (self, phrase): |
| 166 | """
|
172 | """
|
| 167 | Translate a phrase according to this dictionary.
|
173 | Translate a phrase according to this dictionary.
|
| 168 | For language-specific processing, this method should be
|
174 | For language-specific processing, this method should be
|
| 169 | called/overridden by inheriting classes.
|
175 | called/overridden by inheriting classes.
|
| 170 | :param phrase:
|
176 | :param phrase:
|
| 171 | :type phrase: str
|
177 | :type phrase: str
|
| 172 | """
|
178 | """
|
| 173 | translation = self.get(phrase.lower(), None) |
179 | translation = self.get(phrase.lower(), None) |
| 174 | if translation is not None: |
180 | if translation is not None: |
| 175 | translation[self._language_key] = phrase |
181 | translation[self._language_key] = phrase |
| 176 | return translation
|
182 | return translation
|
| 177 | 183 | ||
| 178 | return None |
184 | return None |
| 179 | 185 | ||
| 180 | def translate_expression (self, phrase): |
186 | def translate_expression (self, phrase): |
| 181 | """
|
187 | """
|
| 182 | Translate a phrase according to entries in this dictionary
|
188 | Translate a phrase according to entries in this dictionary
|
| 183 | based on regular expressions.
|
189 | based on regular expressions.
|
| 184 | :param phrase:
|
190 | :param phrase:
|
| 185 | :type phrase:
|
191 | :type phrase:
|
| 186 | """
|
192 | """
|
| 187 | for expression, data in sorted(self._expressions.items(), key=lambda item:-len(item[1])): |
193 | for expression, data in sorted(self._expressions.items(), key=lambda item:-len(item[1])): |
| 188 | expression_match = match(r'{0}$'.format(expression), phrase) |
194 | expression_match = match(r'{0}$'.format(expression), phrase) |
| 189 | if expression_match is not None: |
195 | if expression_match is not None: |
| 190 | data[self._language_key] = expression_match.group(0) |
196 | data[self._language_key] = expression_match.group(0) |
| 191 | return data
|
197 | return data
|
| 192 | 198 | ||
| 193 | return None |
199 | return None |
| 194 | 200 | ||