Rev 297 | Go to most recent revision | Only display areas with differences | Ignore whitespace | Details | Blame | Last modification | View Log | RSS feed
Rev 297 | Rev 298 | ||
---|---|---|---|
1 | """
|
1 | """
|
2 | Created on 2014-10-20
|
2 | Created on 2014-10-20
|
3 | 3 | ||
4 | @author: Thomas 'PointedEars' Lahn <mail@PointedEars.de>
|
4 | @author: Thomas 'PointedEars' Lahn <mail@PointedEars.de>
|
5 | 5 | ||
6 | """
|
6 | """
|
7 | 7 | ||
8 | from os import chdir, stat |
8 | from os import chdir, stat |
9 | from sys import stderr |
9 | from sys import stderr |
10 | from os.path import dirname, realpath, basename |
10 | from os.path import dirname, realpath, basename |
11 | from pickle import dump, load |
11 | from pickle import dump, load |
12 | from re import match, DOTALL, search, sub, split, compile |
12 | from re import match, DOTALL, search, sub, split, compile |
13 | 13 | ||
# Messages are printed by dmsg() only if this is >= their min_level.
debug_level = 2


def dmsg(*args, **kwargs):
    """
    Print a debug message if the module-wide debug level permits it.

    Takes the same positional and keyword arguments as print(), plus:

    :param min_level: Minimum value of ``debug_level`` at which the
        message is printed.  Defaults to 1 if omitted or None.
    :param file: Stream to write to.  Defaults to sys.stderr if omitted.
    """
    # kwargs is a plain dict: its keys have to be looked up with dict
    # operations.  hasattr() inspects attributes and would never find
    # them, silently discarding the caller's min_level and file.
    min_level = kwargs.pop('min_level', None)
    if min_level is None:
        min_level = 1

    kwargs.setdefault('file', stderr)

    if debug_level >= min_level:
        print(*args, **kwargs)
26 | 26 | ||
def sort_dict_alnum_english_key(phrase):
    """
    Build a case-insensitive sort key from the first item of ``phrase``.

    One brace pair is removed (its content is kept) and the result is
    lower-cased.  The brace match is greedy: it spans from the first
    ``{`` to the last ``}`` on the same line.

    :param phrase: Sequence whose first item is the string to sort by
        (presumably a ``(key, value)`` dictionary item -- confirm with
        callers).
    :return: The lower-cased string without the brace pair.
    """
    text = phrase[0]
    braces = compile(r'\{(.+)\}')
    unbraced = braces.sub(r'\1', text)
    return unbraced.lower()
29 | 29 | ||
class Dictionary(dict):
    """
    A Dictionary (not to be confused with its ancestor, dict)
    represents a word dictionary stored in a file.

    Keys are phrases; each value is a dict mapping the field names
    listed in ``_keys`` to the strings parsed for that phrase.
    Phrases that contain parenthesised parts are turned into regular
    expressions by clean() and kept separately in ``_expressions``.
    """
    # Field name that introduces a new phrase in the dictionary file.
    _language_key = 'en'
    # Regex alternation of the field names recognised inside an entry.
    _keys = "ipa|en|lit|pos|com|tag|ex"
    # Class-level fallback only (e.g. for instances restored without
    # __init__ being run).  Each instance gets its own mapping in
    # __init__ so that different dictionaries do not share expressions.
    _expressions = {}

    def __init__(self, *args, **kwargs):
        """
        Create a dictionary; arguments are passed on to dict().
        """
        super().__init__(*args, **kwargs)
        self._expressions = {}

    def load(self, dictionary_file, language_key='en'):
        """
        Loads a word dictionary from a file.

        The parsed result is cached in ``<basename>.pickle`` in the
        directory of this module.  The cache is used instead of the
        dictionary file unless it is missing or older than that file.

        Side effect: changes the working directory of the process to
        the directory of this module.

        :param dictionary_file: Path of the dictionary file; a relative
            path is resolved against the directory of this module.
        :type dictionary_file: str
        :param language_key: Field name that introduces a phrase.
        :type language_key: str
        """
        self._language_key = language_key

        dmsg('Loading dictionary ', end='', min_level=1)

        chdir(dirname(realpath(__file__)))

        pickle_file = basename(dictionary_file) + '.pickle'

        try:
            pickle_mtime = stat(pickle_file).st_mtime
        except FileNotFoundError:
            pickle_mtime = None

        if pickle_mtime is None or stat(dictionary_file).st_mtime > pickle_mtime:
            dmsg('from {0} ...'.format(dictionary_file), end='', min_level=1)
            with open(dictionary_file) as f:
                self._parse_lines(f)

            dmsg('\nSaving pickle {0} ...'.format(pickle_file), end='', min_level=1)
            # TODO: Pickle should only contain strings to be small
            with open(pickle_file, mode='wb') as f:
                dump(self, f)
            dmsg(' done.', min_level=1)
        else:
            dmsg('from {0} ...'.format(pickle_file), end='', min_level=1)
            # SECURITY: unpickling can execute arbitrary code.  The cache
            # file must only be writable by trusted users.
            with open(pickle_file, mode='rb') as f:
                cached = load(f)
            self.update(cached)

        dmsg(' done ({0} entries).'.format(len(self)), min_level=1)

    def _parse_lines(self, lines):
        """
        Parse dictionary source lines into entries of this mapping.

        A line ``<language_key>: <phrase>`` starts a new entry.  A line
        ``<key>: <value>`` with a key from ``_keys`` adds a field to the
        current entry.  A line indented exactly two characters deeper
        than the preceding field line continues that field's value; the
        pieces are joined with single spaces.  Field lines that occur
        before the first phrase line are ignored.

        :param lines: Iterable of text lines (e.g. an open file).
        """
        # Compiled once here instead of once per line.
        re_phrase = compile(
            r'^\s*{0}:\s*(?P<phrase>.+)'.format(self._language_key))
        re_field = compile(
            r'(?P<indent>\s*)(?P<key>{0}):\s*(?P<value>.+)'.format(self._keys))
        re_continuation = compile(r'(?P<indent>\s+)(?P<continuation>\S.*)')

        entry = None    # field dict of the phrase being parsed
        key = None      # name of the field being parsed
        parts = []      # pieces of the value of that field
        indent = None   # indentation of the last field line

        for line in lines:
            m = re_phrase.match(line)
            if m is not None:
                # Finish the multi-line value of the *previous* entry
                # before switching; otherwise it would remain a list
                # there and end up in the new entry under the old key.
                if len(parts) > 1:
                    entry[key] = ' '.join(parts)

                parts = []
                key = None
                entry = self[m.group('phrase')] = {}
                indent = None
                continue

            m = re_field.match(line)
            if m is not None and entry is not None:
                # join previous value if necessary
                if len(parts) > 1:
                    entry[key] = ' '.join(parts)

                indent = m.group('indent')
                key = m.group('key')
                parts = [m.group('value')]
                entry[key] = parts[0]
                continue

            if indent is None:
                continue

            m = re_continuation.match(line)
            if m is not None and len(m.group('indent')) == len(indent) + 2:
                parts.append(m.group('continuation'))

        # join last value if necessary
        if len(parts) > 1:
            entry[key] = ' '.join(parts)

    def clean(self):
        """
        Cleans dictionary entries

        * A phrase containing ``;`` is split into synonyms; each synonym
          (with its enclosing braces removed) becomes a key for the
          same data object, and the original key is removed.
        * A phrase of the form ``{phrase}`` or ``{phrase} (variant)`` is
          replaced by its braced part.  If the object has a callable
          ``clean_entry`` attribute, the braced part is passed through
          it first.  If the result contains parentheses, it is stored
          as a regular expression in ``_expressions`` instead:
          parenthesised parts without ``|`` become optional and ``~``
          becomes a lookahead requiring one more character.
        * All other phrases are left untouched.
        """
        re_parens = compile(r'\(.+\)', DOTALL)
        re_parens_no_alt = compile(r'\(([^\|]+)\)', DOTALL)
        re_braces = compile(
            r'^\s*\{(?P<phrase>.+)\}(?:\s*\((?P<variant>.+?)\))?\s*$',
            DOTALL)
        re_semicolon = compile(r'\s*;\s*')

        clean_entry = getattr(self, 'clean_entry', None)
        if not callable(clean_entry):
            clean_entry = None

        # Iterate over a snapshot because keys are added and removed.
        for orig_phrase, data in list(self.items()):
            # TODO: alternation ("|") and optional parts in parentheses
            # are not expanded for phrases that are split into synonyms.

            if ';' in orig_phrase:
                for synonym in re_semicolon.split(orig_phrase):
                    synonym = re_braces.sub(r'\1', synonym)
                    # A leading or trailing ";" yields an empty piece.
                    if synonym:
                        self[synonym] = data

                del self[orig_phrase]
                continue

            m = re_braces.match(orig_phrase)
            if m is None:
                continue

            phrase = m.group('phrase')
            if clean_entry is not None:
                phrase = clean_entry(phrase)

            if re_parens.search(phrase) is not None:
                # alternation and optional parts
                expr = re_parens_no_alt.sub(r'(?:\1)?', phrase)
                expr = expr.replace('~', '(?=.)')
                self._expressions[expr] = data
            else:
                # remove braces
                self[phrase] = data

            del self[orig_phrase]

    def translate(self, phrase):
        """
        Translate a phrase according to this dictionary.
        For language-specific processing, this method should be
        called/overridden by inheriting classes.

        The lookup uses the lower-cased phrase.  On success the stored
        entry itself (not a copy) is returned, after its language-key
        field has been set to the phrase as given.

        :param phrase: Phrase to look up.
        :type phrase: str
        :return: The entry for the phrase, or None if there is none.
        """
        translation = self.get(phrase.lower(), None)
        if translation is not None:
            translation[self._language_key] = phrase
            return translation

        return None

    def translate_expression(self, phrase):
        """
        Translate a phrase according to entries in this dictionary
        based on regular expressions.

        The expressions collected by clean() are tried in turn; the
        first one that matches the whole phrase wins.  Its entry (not a
        copy) is returned after its language-key field has been set to
        the matched text.

        :param phrase: Phrase to look up.
        :type phrase: str
        :return: The entry of the first matching expression, or None.
        """
        # NOTE(review): the sort key is the number of fields of the
        # entry (item[1]); possibly the length of the expression
        # (item[0]) was intended -- confirm before changing.
        ranked = sorted(self._expressions.items(),
                        key=lambda item: -len(item[1]))

        for expression, data in ranked:
            # Group the expression so that "$" also applies to every
            # branch of a top-level alternation.
            expression_match = match(r'(?:{0})$'.format(expression), phrase)
            if expression_match is not None:
                data[self._language_key] = expression_match.group(0)
                return data

        return None
194 | 194 |