Rev 294 | Rev 296 | Go to most recent revision | Details | Compare with Previous | Last modification | View Log | RSS feed
Rev | Author | Line No. | Line |
---|---|---|---|
293 | PointedEar | 1 | """ |
2 | Created on 2014-10-20 |
||
3 | |||
4 | @author: Thomas 'PointedEars' Lahn <mail@PointedEars.de> |
||
5 | |||
6 | """ |
||
7 | |||
8 | from os import chdir, stat |
||
9 | from sys import stderr |
||
10 | from os.path import dirname, realpath, basename |
||
11 | from pickle import dump, load |
||
12 | from re import match, DOTALL, search, sub, split, compile |
||
13 | |||
debug_level = 2


def dmsg(*args, **kwargs):
    """
    Print a debug message if the module-wide ``debug_level`` permits it.

    Positional and keyword arguments are passed on to ``print()``,
    except for the following keyword arguments:

    min_level -- minimum value that ``debug_level`` must have for the
                 message to be printed; 1 if omitted or ``None``.
    file      -- stream to write to; ``sys.stderr`` if omitted.
    """
    # kwargs is a dict, so its entries must be looked up as keys;
    # hasattr() only tests attributes of the dict object and would
    # always report the options as missing.
    min_level = kwargs.pop('min_level', None)
    if min_level is None:
        min_level = 1

    kwargs.setdefault('file', stderr)

    if debug_level >= min_level:
        print(*args, **kwargs)
||
26 | |||
def sort_dict_alnum_english_key(phrase):
    """
    Return the sort key for a dictionary item.

    phrase -- sequence whose first element is the text to sort by.

    A brace-delimited span in that text is replaced by its contents
    (the match is greedy: from the first '{' to the last '}'), and the
    result is converted to lower case.
    """
    text = phrase[0]
    without_braces = sub(r'\{(.+)\}', r'\1', text)
    return without_braces.lower()
||
29 | |||
class Dictionary(dict):
    """
    Mapping of phrases to their entry data, read from a dictionary file.

    Each key is a phrase; each value is a dict that maps the field keys
    listed in ``_keys`` to string values.
    """
    # regular-expression alternation of the field keys allowed in an entry
    _keys = "ipa|en|lit|pos|com|tag|ex"
    # not used by the methods of this class that are visible here
    _expressions = {}

    def load(self, dictionary_file, language_key='en'):
        """
        Fill this mapping from a dictionary file, using a pickle cache.

        dictionary_file -- path of the dictionary source file.  The
                           working directory is changed to the directory
                           of this module first, so a relative path is
                           resolved against that directory.
        language_key    -- key that introduces a new phrase in the source
                           file (inserted into a regular expression
                           as is).

        The cache is named after the base name of the source file with
        '.pickle' appended.  If it is missing or older than the source
        file, the source file is parsed and the cache is rewritten;
        otherwise the entries are read from the cache.

        Raises FileNotFoundError if the source file does not exist.
        """
        dmsg('Loading dictionary ', end='', min_level=1)

        chdir(dirname(realpath(__file__)))

        pickle_file = basename(dictionary_file) + '.pickle'

        try:
            pickle_mtime = stat(pickle_file).st_mtime
        except FileNotFoundError:
            pickle_mtime = None

        if pickle_mtime is None or stat(dictionary_file).st_mtime > pickle_mtime:
            dmsg('from {0} ...'.format(dictionary_file), end='', min_level=1)
            # NOTE(review): the file is opened with the locale's default
            # encoding; confirm that this matches the dictionary files.
            with open(dictionary_file) as f:
                self._parse_entries(f, language_key)

            dmsg('\nSaving pickle {0} ...'.format(pickle_file), end='', min_level=1)
            # TODO: Pickle should only contain strings to be small
            with open(pickle_file, mode='wb') as f:
                dump(self, f)
            dmsg(' done.', min_level=1)
        else:
            dmsg('from {0} ...'.format(pickle_file), end='', min_level=1)
            # NOTE: unpickling can execute arbitrary code; the cache file
            # must only ever be one that this method has written itself.
            with open(pickle_file, mode='rb') as f:
                cached = load(f)
            for key, value in cached.items():
                self[key] = value

        dmsg(' done ({0} entries).'.format(len(self)), min_level=1)

    def _parse_entries(self, lines, language_key):
        """
        Parse dictionary source lines and store the entries in this mapping.

        lines        -- iterable of text lines.
        language_key -- key that introduces a new phrase.

        A line "<language_key>: <phrase>" starts a new entry.  A line
        "<key>: <value>" with a key from ``_keys`` sets a field of the
        current entry.  A following line that is indented exactly two
        characters deeper than the field line continues the field value;
        continuations are joined to the value with single spaces.
        """
        # the patterns do not change per line, so compile them only once
        phrase_re = compile(
            r'^\s*{0}:\s*(?P<phrase>.+)'.format(language_key))
        field_re = compile(
            r'(?P<indent>\s*)(?P<key>{0}):\s*(?P<value>.+)'.format(self._keys))
        continuation_re = compile(r'(?P<indent>\s+)(?P<continuation>\S.*)')

        entry = None    # data dict of the phrase that is being read
        key = None      # field key that was read last
        value = None    # its value: str, or list once it was continued
        indent = None   # indentation of the line of that field

        for line in lines:
            m = phrase_re.match(line)
            if m is not None:
                # Finish the pending value of the previous entry first,
                # and forget it, so that it cannot end up in the new one.
                if isinstance(value, list):
                    entry[key] = ' '.join(value)
                entry = self[m.group("phrase")] = {}
                key = value = indent = None
                continue

            m = field_re.match(line)
            if m is not None:
                if entry is None:
                    # field before the first phrase: no entry to add it to
                    continue

                # join previous value if necessary
                if isinstance(value, list):
                    entry[key] = ' '.join(value)

                indent = m.group("indent")
                key = m.group("key")
                # assign a string for memory efficiency
                value = entry[key] = m.group("value")
                continue

            if indent is None:
                # no field that could be continued
                continue

            m = continuation_re.match(line)
            if m is not None and len(m.group("indent")) == len(indent) + 2:
                continuation = m.group("continuation")
                if isinstance(value, str):
                    # when a continuation is first found, convert to a list
                    # because there could be more continuations
                    value = [value, continuation]
                else:
                    value.append(continuation)

        # join last value if necessary
        if isinstance(value, list):
            entry[key] = ' '.join(value)

    def clean(self):
        """
        Normalize the phrases that are used as keys, in place.

        A phrase that contains ';' is replaced by one entry per
        ';'-separated synonym; all of these entries share the same data
        object.  A phrase, or synonym, of the form "{phrase}" or
        "{phrase} (variant)" is reduced to the text between the braces.
        Empty synonyms (e.g. from a trailing ';') are ignored; a phrase
        that yields no synonym at all is left unchanged.

        An entry that already exists under a resulting key is overwritten.
        """
        braces_re = compile(
            r'^\s*\{(?P<phrase>.+)\}(?:\s*\((?P<variant>.+?)\))?\s*$',
            DOTALL)
        semicolon_re = compile(r'\s*;\s*')

        # iterate over a snapshot because entries are added and removed
        for orig_phrase, data in list(self.items()):
            # TODO: alternating ('|') and optional parts in parentheses
            # are not expanded yet

            if ';' in orig_phrase:
                synonyms = [
                    synonym
                    for synonym in (
                        braces_re.sub(r'\1', part)
                        for part in semicolon_re.split(orig_phrase))
                    if synonym
                ]

                if synonyms:
                    for synonym in synonyms:
                        self[synonym] = data

                    # a synonym never contains ';', so it cannot be the
                    # key that is removed here
                    del self[orig_phrase]
            else:
                m = braces_re.match(orig_phrase)
                if m is not None:
                    # TODO alternation and optional parts
                    self[m.group("phrase")] = data
                    del self[orig_phrase]