Rev 298 | Details | Compare with Previous | Last modification | View Log | RSS feed
Rev | Author | Line No. | Line |
---|---|---|---|
293 | PointedEar | 1 | """ |
2 | Created on 2014-10-20 |
||
3 | |||
4 | @author: Thomas 'PointedEars' Lahn <mail@PointedEars.de> |
||
5 | |||
6 | """ |
||
7 | |||
8 | from os import chdir, stat |
||
9 | from sys import stderr |
||
10 | from os.path import dirname, realpath, basename |
||
11 | from pickle import dump, load |
||
12 | from re import match, DOTALL, search, sub, split, compile |
||
13 | |||
# Verbosity threshold read by dmsg(): a message is printed only when this
# value is greater than or equal to the message's ``min_level``.
debug_level = 2
||
15 | |||
def dmsg (*args, **kwargs):
    """
    Print a debug message if the module's debug level permits it.

    All positional and keyword arguments are passed on to ``print()``,
    except for the ones described below.

    :param file: Stream to write to.  When missing or falsy, the
                 message goes to standard error.
    :param min_level: Smallest value of ``debug_level`` at which the
                      message is printed (default: 1).  It is removed
                      from the keyword arguments before printing.
    """
    threshold = kwargs.pop('min_level', 1)

    # debug output must not mix with regular output by default
    if not kwargs.get('file'):
        kwargs['file'] = stderr

    if debug_level < threshold:
        return

    print(*args, **kwargs)
24 | |||
def sort_dict_alnum_english_key (phrase):
    """
    Compute a case-insensitive key from the first element of an item.

    A brace-delimited span in the first element is replaced by its
    content (the pattern is greedy, so it reaches from the first ``{``
    to the last ``}``), then the result is lower-cased.

    :param phrase: Indexable object whose first element is the phrase
                   string, for example a ``(phrase, data)`` pair.
    :return: The phrase without the matched braces, in lower case.
    :rtype: str
    """
    headword = phrase[0]
    without_braces = sub(
        r'\{(.+)\}', lambda found: found.group(1), headword)

    return without_braces.lower()
27 | |||
class Dictionary (dict):
    """
    A Dictionary (not to be confused with its ancestor, dict)
    represents a word dictionary stored in a file.

    Each key is a phrase; each value is a ``dict`` that maps the field
    keys found in the file (see ``_keys``) to their string values.
    Entries whose phrase contains optional or alternating parts are
    moved by :meth:`clean` into a separate mapping of regular
    expressions, which :meth:`translate_expression` consults.
    """

    # Field key whose line starts a new entry in the dictionary file.
    _language_key = 'en'

    # Regular-expression alternation of the field keys recognized
    # within an entry.
    _keys = "ipa|en|lit|pos|com|tag|ex"

    # Empty fallback that is only read from.  clean() creates a separate
    # mapping per instance, so that instances do not share expressions.
    _expressions = {}

    def load (self, dictionary_file, keys=None, language_key=None):
        """
        Loads a word dictionary from a file.

        The working directory is changed to the directory of this
        module first, so a relative ``dictionary_file`` is resolved
        against that directory.  The parsed entries are cached in
        ``<basename of dictionary_file>.pickle`` in that directory; the
        cache is used instead of the text file unless it is missing or
        older than the text file.

        :param dictionary_file: Path of the dictionary text file.
        :type dictionary_file: str
        :param keys: Regular-expression alternation of field keys;
                     replaces the default of this instance if given.
        :type keys: str
        :param language_key: Field key whose line starts a new entry;
                             replaces the default of this instance if
                             given.
        :type language_key: str
        """
        if keys is not None:
            self._keys = keys

        if language_key is not None:
            self._language_key = language_key

        dmsg('Loading dictionary ', end='', min_level=1)

        chdir(dirname(realpath(__file__)))

        pickle_file = basename(dictionary_file) + '.pickle'

        try:
            pickle_mtime = stat(pickle_file).st_mtime
        except FileNotFoundError:
            pickle_mtime = None

        if pickle_mtime is None or stat(dictionary_file).st_mtime > pickle_mtime:
            dmsg('from {0} ...'.format(dictionary_file), end='', min_level=1)

            with open(dictionary_file) as f:
                self._parse_lines(f)

            dmsg('\nSaving pickle {0} ...'.format(pickle_file), end='', min_level=1)

            # TODO: Pickle should only contain strings to be small
            with open(pickle_file, mode='wb') as f:
                dump(self, f)

            dmsg(' done.', min_level=1)
        else:
            dmsg('from {0} ...'.format(pickle_file), end='', min_level=1)

            # NOTE(review): unpickling executes code embedded in the
            # file; this is only safe while the cache file can be
            # written by trusted users only.
            with open(pickle_file, mode='rb') as f:
                cached = load(f)

            for key, value in cached.items():
                self[key] = value

        dmsg(' done ({0} entries).'.format(len(self)), min_level=1)

    def _parse_lines (self, lines):
        """
        Populates this dictionary from the lines of a dictionary file.

        A line ``<language key>: <phrase>`` starts a new entry.  A line
        ``<key>: <value>`` sets a field of the current entry.  A line
        that is indented two characters deeper than the preceding field
        line continues the value of that field; all parts of a value are
        joined with single spaces.  Other lines are ignored.

        :param lines: The lines to parse.
        :type lines: iterable of str
        """
        re_phrase = compile(
            r'^\s*{0}:\s*(?P<phrase>.+)'.format(self._language_key))
        re_field = compile(
            r'(?P<indent>\s*)(?P<key>{0}):\s*(?P<value>.+)'.format(self._keys))
        re_continuation = compile(r'(?P<indent>\s+)(?P<continuation>\S.*)')

        phrase = None
        key = None
        value = None
        indent = None

        for line in lines:
            m = re_phrase.match(line)
            if m is not None:
                # join the pending value while "phrase" still refers to
                # the entry that owns it; otherwise the joined value
                # would end up in the entry that starts here
                if isinstance(value, list):
                    self[phrase][key] = ' '.join(value)

                phrase = m.group("phrase")
                self[phrase] = {}
                key = None
                value = None
                indent = None
                continue

            m = re_field.match(line)
            if m is not None:
                # join previous value if necessary
                if isinstance(value, list):
                    self[phrase][key] = ' '.join(value)

                indent = m.group("indent")
                key = m.group("key")
                value = m.group("value")

                # assign a string for memory efficiency
                self[phrase][key] = value
                continue

            if indent is None:
                # no field line in the current entry yet, so there is
                # nothing that this line could continue
                continue

            m = re_continuation.match(line)
            if m is not None and len(m.group("indent")) == len(indent) + 2:
                continuation = m.group("continuation")
                if isinstance(value, str):
                    # when a continuation is first found, convert to a list
                    # because there could be more continuations
                    value = self[phrase][key] = [value, continuation]
                else:
                    value.append(continuation)

        # join last value if necessary
        if isinstance(value, list):
            self[phrase][key] = ' '.join(value)

    def clean (self):
        """
        Cleans dictionary entries.

        * A phrase containing semicolons is split into its synonyms;
          each synonym (with enclosing braces removed) becomes a key
          referring to the same data object, and the original entry is
          removed.
        * A phrase enclosed in braces is passed to ``self.clean_entry()``
          if such a callable exists.  If the result contains
          parentheses, it is converted into a regular expression and
          stored for :meth:`translate_expression`; otherwise it is
          stored as an ordinary key.  The original entry is removed in
          both cases.
        * All other entries are left alone.
        """
        re_parens = compile(r'\(.+\)', DOTALL)
        re_parens_no_alt = compile(r'\(([^\|]+)\)', DOTALL)
        re_braces = compile(
            r'^\s*\{(?P<phrase>.+)\}(?:\s*\((?P<variant>.+?)\))?\s*$',
            DOTALL)
        re_semicolon = compile(r'\s*;\s*')

        # give this instance its own mapping; writing into the class
        # attribute would make all instances share their expressions
        if '_expressions' not in vars(self):
            self._expressions = {}

        clean_entry = getattr(self, 'clean_entry', None)

        # iterate over a snapshot because entries are added and removed
        for orig_phrase, data in list(self.items()):
            # TODO: handle alternation ("|") and optional parts in
            # phrases that are not enclosed in braces

            if ';' in orig_phrase:
                for synonym in re_semicolon.split(orig_phrase):
                    self[re_braces.sub(r'\1', synonym)] = data

                del self[orig_phrase]
                continue

            m = re_braces.match(orig_phrase)
            if m is None:
                continue

            phrase = m.group('phrase')

            if callable(clean_entry):
                phrase = clean_entry(phrase)

            if re_parens.search(phrase) is not None:
                # alternation and optional parts: a parenthesized part
                # without "|" becomes an optional non-capturing group
                expr = re_parens_no_alt.sub(r'(?:\1)?', phrase)
                expr = sub('~', '(?=.)', expr)
                self._expressions[expr] = data
            else:
                # remove braces
                self[phrase] = data

            del self[orig_phrase]

    def translate (self, phrase):
        """
        Translate a phrase according to this dictionary.

        For language-specific processing, this method should be
        called/overridden by inheriting classes.

        The phrase is looked up in lower case.  The stored data object
        itself is returned (not a copy) after the phrase as passed has
        been stored in it under the language key.

        :param phrase: The phrase to translate.
        :type phrase: str
        :return: The data of the matching entry, or ``None`` if there
                 is no entry for the phrase.
        :rtype: dict or None
        """
        translation = self.get(phrase.lower(), None)
        if translation is None:
            return None

        translation[self._language_key] = phrase
        return translation

    def translate_expression (self, phrase):
        """
        Translate a phrase according to entries in this dictionary
        based on regular expressions.

        The expressions collected by :meth:`clean` are matched, one
        after the other, against the start of the phrase with ``$``
        appended.  The data object of the first matching expression is
        returned (not a copy) after the matched text has been stored in
        it under the language key.

        :param phrase: The phrase to translate.
        :type phrase: str
        :return: The data of the first matching expression, or ``None``
                 if no expression matches.
        :rtype: dict or None
        """
        # NOTE(review): this orders the candidates by the size of their
        # data dict (most fields first); presumably the length of the
        # expression, item[0], was intended - confirm before changing.
        candidates = sorted(
            self._expressions.items(), key=lambda item: -len(item[1]))

        for expression, data in candidates:
            expression_match = match(r'{0}$'.format(expression), phrase)
            if expression_match is not None:
                data[self._language_key] = expression_match.group(0)
                return data

        return None