Rev 298 | Details | Compare with Previous | Last modification | View Log | RSS feed
| Rev | Author | Line No. | Line | 
|---|---|---|---|
| 293 | PointedEar | 1 | """ | 
| 2 | Created on 2014-10-20 | ||
| 3 | |||
| 4 | @author: Thomas 'PointedEars' Lahn <mail@PointedEars.de> | ||
| 5 | |||
| 6 | """ | ||
| 7 | |||
| 8 | from os import chdir, stat | ||
| 9 | from sys import stderr | ||
| 10 | from os.path import dirname, realpath, basename | ||
| 11 | from pickle import dump, load | ||
| 12 | from re import match, DOTALL, search, sub, split, compile | ||
| 13 | |||
debug_level = 2

def dmsg (*args, **kwargs):
    """
    Prints a debug message if the module-wide debug level permits it.

    All positional and keyword arguments are passed on to print(),
    except for the keyword argument ``min_level``, which is consumed here.

    :param min_level: (keyword only) Minimum value that ``debug_level``
        must have for the message to be printed.  Defaults to 1.
    :param file: (keyword only) Target stream; standard error is used
        if it is missing or false-valued.
    """
    threshold = kwargs.pop('min_level', 1)

    # a missing or false-valued 'file' falls back to standard error
    kwargs['file'] = kwargs.get('file') or stderr

    if threshold > debug_level:
        return

    print(*args, **kwargs)
| 24 | |||
def sort_dict_alnum_english_key (phrase):
    """
    Returns a case-insensitive sort key for a dictionary item.

    :param phrase: Sequence whose first element is the phrase string
        (e.g. a ``(phrase, data)`` item).
    :return: The phrase in lowercase, with one pair of curly braces
        removed (the match is greedy: from the first ``{`` to the
        last ``}``).
    """
    braces = compile(r'\{(.+)\}')
    unbraced = braces.sub(r'\1', phrase[0])
    return unbraced.lower()
| 27 | |||
class Dictionary (dict):
    """
    A Dictionary (not to be confused with its ancestor, dict)
    represents a word dictionary stored in a file.

    Each key is a phrase; each value is a dict that maps the field keys
    found for that phrase in the file (see ``_keys``) to their text.
    """
    # key that introduces a new phrase in the dictionary file
    _language_key = 'en'
    # regular expression alternation of the field keys recognized below a phrase
    _keys = "ipa|en|lit|pos|com|tag|ex"
    # Class-level fallback only, so that translate_expression() works before
    # clean() has run.  clean() never writes here; it gives each instance
    # its own dict so that instances do not see each other's expressions.
    _expressions = {}

    def load (self, dictionary_file, keys=None, language_key=None, encoding=None):
        """
        Loads a word dictionary from a file.

        The parsed entries are cached in a pickle file named after the
        dictionary file; the cache is used instead of the dictionary file
        unless the dictionary file is newer than the cache.

        Note that this method changes the working directory of the process
        to the directory of this module, so relative paths are resolved
        against that directory.

        :param dictionary_file: Path of the dictionary file.
        :type dictionary_file: str
        :param keys: Regular expression alternation of recognized field
            keys; replaces the default if not None.
        :type keys: str
        :param language_key: Key that introduces a phrase; replaces the
            default if not None.
        :type language_key: str
        :param encoding: Encoding passed to open() for the dictionary
            file; None (the default) uses the platform default.
        :type encoding: str
        """
        if keys is not None:
            self._keys = keys

        if language_key is not None:
            self._language_key = language_key

        dmsg('Loading dictionary ', end='', min_level=1)

        chdir(dirname(realpath(__file__)))

        pickle_file = basename(dictionary_file) + '.pickle'

        try:
            pickle_mtime = stat(pickle_file).st_mtime
        except FileNotFoundError:
            pickle_mtime = None

        if pickle_mtime is None or stat(dictionary_file).st_mtime > pickle_mtime:
            dmsg('from {0} ...'.format(dictionary_file), end='', min_level=1)

            with open(dictionary_file, encoding=encoding) as f:
                self._read_entries(f)

            dmsg('\nSaving pickle {0} ...'.format(pickle_file), end='', min_level=1)

            # TODO: Pickle should only contain strings to be small
            with open(pickle_file, mode='wb') as f:
                dump(self, f)

            dmsg(' done.', min_level=1)
        else:
            dmsg('from {0} ...'.format(pickle_file), end='', min_level=1)

            # SECURITY: unpickling executes code embedded in the file; the
            # cache must only ever be a file written by this method.
            with open(pickle_file, mode='rb') as f:
                cached = load(f)

            for key, value in cached.items():
                self[key] = value

        dmsg(' done ({0} entries).'.format(len(self)), min_level=1)

    def _read_entries (self, lines):
        """
        Parses dictionary source lines and stores the entries in this object.

        A line ``<language_key>: <phrase>`` starts a new entry.  A following
        line ``<key>: <value>`` adds a field to the current entry.  A line
        indented two characters deeper than the preceding field line
        continues that field's value; continuations are joined with a space.

        :param lines: Iterable of lines (e.g. an open text file).
        """
        re_phrase = compile(
            r'^\s*{0}:\s*(?P<phrase>.+)'.format(self._language_key))
        re_field = compile(
            r'(?P<indent>\s*)(?P<key>{0}):\s*(?P<value>.+)'.format(self._keys))
        re_continuation = compile(r'(?P<indent>\s+)(?P<continuation>\S.*)')

        phrase = None
        key = None
        value = None
        indent = None

        for line in lines:
            m = re_phrase.match(line)
            if m is not None:
                # Finish the previous entry first; otherwise its last
                # multi-line value would stay a list and would later be
                # written into the new entry.
                self._join_pending(phrase, key, value)
                phrase = m.group("phrase")
                self[phrase] = {}
                key = value = indent = None
                continue

            if phrase is None:
                # nothing to attach a field or continuation to yet
                continue

            m = re_field.match(line)
            if m is not None:
                # join previous value if necessary
                self._join_pending(phrase, key, value)

                indent = m.group("indent")
                key = m.group("key")
                value = m.group("value")
                # assign a string for memory efficiency
                self[phrase][key] = value
                continue

            if indent is None:
                continue

            m = re_continuation.match(line)
            if m is not None and len(m.group("indent")) == len(indent) + 2:
                continuation = m.group("continuation")
                if isinstance(value, str):
                    # when a continuation is first found, convert to a list
                    # because there could be more continuations
                    value = self[phrase][key] = [value, continuation]
                else:
                    value.append(continuation)

        # join last value if necessary
        self._join_pending(phrase, key, value)

    def _join_pending (self, phrase, key, value):
        """
        Replaces a collected multi-line value by its space-joined string.

        Does nothing unless ``value`` is a list of continuation parts.
        """
        if isinstance(value, list):
            self[phrase][key] = ' '.join(value)

    def clean (self):
        """
        Cleans dictionary entries.

        * A phrase containing ``;`` is replaced by one entry per synonym
          (all synonyms refer to the same data); braces around a synonym
          are removed.
        * A phrase in braces is replaced by the phrase without braces,
          after passing it through ``self.clean_entry()`` if such a
          method exists.  If the result contains parentheses, it is
          converted to a regular expression and stored for
          translate_expression() instead.
        * All other phrases are left as they are.
        """
        re_parens = compile(r'\(.+\)', DOTALL)
        re_parens_no_alt = compile(r'\(([^\|]+)\)', DOTALL)
        re_braces = compile(
            r'^\s*\{(?P<phrase>.+)\}(?:\s*\((?P<variant>.+?)\))?\s*$',
            DOTALL)
        re_semicolon = compile(r'\s*;\s*')

        # Expressions are kept per instance; the class attribute is shared
        # by all instances and must not be written to.
        expressions = vars(self).setdefault('_expressions', {})
        has_clean_entry = callable(getattr(self, 'clean_entry', None))

        # iterate over a snapshot because entries are added and deleted
        for orig_phrase, data in list(self.items()):
            # TODO alternation ('|') and optional parts in parentheses
            # are not expanded for phrases without braces

            if ';' in orig_phrase:
                for synonym in re_semicolon.split(orig_phrase):
                    self[re_braces.sub(r'\1', synonym)] = data

                del self[orig_phrase]
                continue

            m = re_braces.match(orig_phrase)
            if m is None:
                continue

            phrase = m.group('phrase')

            if has_clean_entry:
                phrase = self.clean_entry(phrase)

            if re_parens.search(phrase) is not None:
                # alternation and optional parts
                expr = re_parens_no_alt.sub(r'(?:\1)?', phrase)
                expr = expr.replace('~', '(?=.)')
                expressions[expr] = data
            else:
                # remove braces
                self[phrase] = data

            del self[orig_phrase]

    def translate (self, phrase):
        """
        Translate a phrase according to this dictionary.
        For language-specific processing, this method should be
        called/overridden by inheriting classes.

        :param phrase: Phrase to look up; the lookup uses its lowercase form.
        :type phrase: str
        :return: A copy of the entry's data with the phrase as given stored
            under the language key, or None if there is no such entry.
            A copy is returned so that the stored entry, which may be
            shared by several synonyms, is not modified.
        """
        translation = self.get(phrase.lower(), None)
        if translation is None:
            return None

        result = dict(translation)
        result[self._language_key] = phrase
        return result

    def translate_expression (self, phrase):
        """
        Translate a phrase according to entries in this dictionary
        based on regular expressions.

        :param phrase: Phrase to match against the stored expressions.
        :type phrase: str
        :return: A copy of the data of the first expression that matches
            the whole phrase, with the matched text stored under the
            language key, or None if no expression matches.
        """
        # NOTE(review): candidates are ordered by the number of fields of
        # their data (len(item[1])), not by expression length -- confirm
        # that this is intended.
        candidates = sorted(self._expressions.items(),
                            key=lambda item: -len(item[1]))
        for expression, data in candidates:
            # The group keeps a top-level '|' in the expression from
            # escaping the start and end anchors.
            expression_match = match(r'(?:{0})$'.format(expression), phrase)
            if expression_match is not None:
                result = dict(data)
                result[self._language_key] = expression_match.group(0)
                return result

        return None