Subversion Repositories LCARS

Rev

Rev 291 | Rev 293 | Go to most recent revision | Only display areas with differences | Ignore whitespace | Details | Blame | Last modification | View Log | RSS feed

Rev 291 Rev 292
-
 
1
#!/usr/bin/env python3
-
 
2
1
'''
3
'''
2
Created on 2014-10-20
4
Created on 2014-10-20
3

5

4
@author: Thomas 'PointedEars' Lahn <mail@PointedEars.de>
6
@author: Thomas 'PointedEars' Lahn <mail@PointedEars.de>
5
'''
7
'''
6
from sys import argv, stderr
8
from sys import argv, stderr
7
from re import findall, DOTALL, IGNORECASE, match, sub, compile, \
9
from re import findall, DOTALL, IGNORECASE, match, sub, compile, \
8
    split
10
    split
9
from os import chdir
11
from os import chdir, stat
10
from os.path import dirname, realpath
12
from os.path import dirname, realpath, basename
11
from collections import OrderedDict
13
from collections import OrderedDict
12
from functools import cmp_to_key
14
from functools import cmp_to_key
13
from copy import deepcopy
15
from copy import deepcopy
14
from collections.abc import MutableSequence
16
from collections.abc import MutableSequence
-
 
17
from pickle import dump, load
15
18
-
 
19
# Verbosity threshold: dmsg() compares this value against a message's
# min_level before printing.
debug_level = 2

# Phrase -> entry data; filled in place by load_dictionary().
dictionary = {}

# Prepositional prefixes and their English renderings.
# NOTE(review): '|' presumably separates alternative translations -- confirm.
prepositions = {
    "fi'": "on",
    "na'": "at|to",
    "t'": "of",
}
23
27
-
 
28
def dmsg(*args, **kwargs):
    """Print a debug message if the current debug level permits it.

    Accepts the same arguments as print(), plus:

    min_level -- lowest value of the module-level ``debug_level`` at which
                 the message is printed; missing or None means 1.

    Unless the caller passes ``file``, the message goes to stderr.
    """
    # kwargs is a dict, so its entries must be looked up as keys;
    # hasattr() tests attributes and was always false here, which discarded
    # the caller's min_level and file.
    min_level = kwargs.pop('min_level', None)
    if min_level is None:
        min_level = 1

    kwargs.setdefault('file', stderr)

    if debug_level >= min_level:
        print(*args, **kwargs)
-
 
38
24
class MutableString2(MutableSequence):
    """String-like buffer that grows by appending fragments.

    Fragments are collected in a list and joined on demand.  Only appending
    is supported (``extend`` or ``+=``); item assignment, item deletion and
    insertion raise NotImplementedError.
    """

    def __init__(self, value=None):
        # None means "start empty"; any other value is stored as its str().
        self._values = [str(value)] if value is not None else []

    def __add__(self, value):
        """Return a plain str: the buffer's text followed by str(value)."""
        # str.join() only accepts real str items, so joining ``self``
        # directly raised TypeError.
        return str(self) + str(value)

    def __delitem__(self, index):
        # The index parameter is required so that ``del s[i]`` reaches this
        # method and reports the operation as unsupported.
        raise NotImplementedError

    def __getitem__(self, index):
        return str(self)[index]

    def __len__(self):
        return len(str(self))

    def __repr__(self):
        return ''.join(self._values)

    def __setitem__(self, index, value):
        raise NotImplementedError

    def __str__(self):
        return self.__repr__()

    def extend(self, values):
        """Append *values* as a single fragment (``+=`` ends up here)."""
        # Coerce to str so that a later join cannot fail on a foreign type.
        self._values.append(str(values))

    def insert(self, index, value):
        raise NotImplementedError
54
69
-
 
70
def cli_help():
    """Print a one-line usage summary to standard output.

    The program name shown is the base name of argv[0].
    """
    program = basename(argv[0])
    print('Usage: {0} TEXT...'.format(program))
-
 
72
55
def load_dictionary(dictionary, dictionary_file):
    """Fill *dictionary* in place from *dictionary_file*, using a pickle cache.

    The working directory is first changed to the directory of this script,
    so relative file names are resolved there.  If no
    ``<basename(dictionary_file)>.pickle`` exists, or the dictionary file is
    newer than it, the text file is parsed and the pickle is (re)written;
    otherwise the entries are read from the pickle.

    Text format, as recognised by the patterns below:

    * ``vuh: PHRASE`` starts a new entry;
    * ``KEY: VALUE`` (KEY one of ipa, en, lit, pos, com, tag, ex) stores an
      attribute of the current entry as a MutableString2;
    * a line indented exactly two characters deeper than the last KEY line
      is appended to that attribute, separated by one space.

    Raises OSError (e.g. FileNotFoundError) if the dictionary file cannot be
    examined or opened.
    """
    dmsg('Loading dictionary ', end='', min_level=1)

    # NOTE: this changes the working directory of the whole process.
    chdir(dirname(realpath(__file__)))

    pickle_file = basename(dictionary_file) + '.pickle'

    try:
        pickle_mtime = stat(pickle_file).st_mtime
    except FileNotFoundError:
        pickle_mtime = None

    if pickle_mtime is None or stat(dictionary_file).st_mtime > pickle_mtime:
        dmsg('from {0} ...'.format(dictionary_file), end='', min_level=1)

        key_pattern = r'(?P<indent>\s*)(?P<key>{0}):\s*(?P<value>.+)'.format(
            "ipa|en|lit|pos|com|tag|ex")
        phrase = None   # entry currently being filled
        key = None      # attribute that continuation lines extend
        indent = None   # indentation of the last KEY line, None if there is none

        with open(dictionary_file) as f:
            for line in f:
                m = match(r'^\s*vuh:\s*(?P<phrase>.+)', line)
                if m is not None:
                    phrase = m.group("phrase")
                    dictionary[phrase] = {}
                    indent = None
                    continue

                if phrase is None:
                    # Nothing to attach to before the first "vuh:" line;
                    # previously a KEY line here raised UnboundLocalError.
                    continue

                m = match(key_pattern, line)
                if m is not None:
                    indent = m.group("indent")
                    key = m.group("key")
                    dictionary[phrase][key] = MutableString2(m.group("value"))
                    continue

                if indent is None:
                    continue

                m = match(r'(?P<indent>\s+)(?P<continuation>\S.*)', line)
                if m is not None and len(m.group("indent")) == len(indent) + 2:
                    dictionary[phrase][key] += (" " + m.group("continuation"))

        dmsg('\nSaving pickle {0} ...'.format(pickle_file), end='', min_level=1)
        # TODO: Pickle should only contain strings to be small
        with open(pickle_file, mode='wb') as f:
            dump(dictionary, f)
        dmsg(' done.', min_level=1)
    else:
        dmsg('from {0} ...'.format(pickle_file), end='', min_level=1)
        # NOTE(security): unpickling can execute arbitrary code; the pickle
        # file must come from a trusted location.
        with open(pickle_file, mode='rb') as f:
            cached = load(f)
        dictionary.update(cached)

    dmsg(' done ({0} entries).'.format(len(dictionary)), min_level=1)
86
122
87
def clean_dictionary(dictionary):
    """Normalise the phrase keys of *dictionary* in place.

    * A key listing several synonyms separated by ";" is replaced by one
      entry per synonym; every synonym gets its own deep copy of the data.
    * Braces enclosing a whole key, or a whole synonym, are removed.

    Empty synonyms (e.g. from a trailing ";") are dropped.
    """
    braces_re = compile(r'^\s*\{(.+)\}\s*$')
    semicolon_re = compile(r'\s*;\s*')

    # Iterate over a snapshot: entries are added and removed while looping.
    for orig_phrase, data in list(dictionary.items()):
        if ";" in orig_phrase:
            for part in semicolon_re.split(orig_phrase):
                # Strip braces from the individual synonym.  Substituting on
                # the whole key instead re-created the unsplit key, which
                # the deletion below then removed again.
                synonym = braces_re.sub(r'\1', part)
                if synonym:
                    dictionary[synonym] = deepcopy(data)

            del dictionary[orig_phrase]
        else:
            m = braces_re.match(orig_phrase)
            if m is not None:
                # The braced key goes away, so its data can simply move.
                dictionary[m.group(1)] = dictionary.pop(orig_phrase)
106
142
107
def sort_dict_alnum_english_key(phrase):
    """Return the lower-cased sort text for *phrase*.

    Only ``phrase[0]`` is used (for a ``(key, data)`` item that is the key).
    A brace pair in it is removed while the enclosed text is kept; the match
    is greedy, spanning from the first "{" to the last "}".
    """
    headword = phrase[0]
    unbraced = sub(r'\{(.+)\}', r'\1', headword)
    return unbraced.lower()
109
145
110
def get_sort_dict_alnum_vulcan_key():
    """Build a ``key=`` callable that orders items by a custom alphabet.

    Each item is reduced to text with sort_dict_alnum_english_key() and cut
    into letters of the alphabet below (digraphs such as "sh" count as one
    letter).  Items compare letter by letter in alphabet order; if one is a
    prefix of the other, the shorter one sorts first.  Characters that are
    not part of the alphabet are not matched and so do not take part in the
    comparison.
    """
    alphabet = [letter.lower() for letter in (
        " ", 'S', 'T', 'P', 'K', 'R', 'L', 'A', 'Sh', 'O', 'U', 'D',
        'V', 'Kh', 'E', 'H', 'G', 'Ch', 'I', 'N', 'Zh', 'M', 'Y', 'F', 'Z',
        'Th', 'W', 'B', "'", '-')]
    rank = {letter: position for position, letter in enumerate(alphabet)}

    # Longest alternatives first, so a digraph wins over its first letter.
    longest_first = sorted(alphabet, key=len, reverse=True)
    tokenizer = compile(r'(?:{0})'.format('|'.join(longest_first)))

    def ranks(item):
        # Split into letters and map each to its alphabet position.
        text = sort_dict_alnum_english_key(item)
        return [rank[token] for token in findall(tokenizer, text)]

    def sort_dict_alnum_vulcan(a, b):
        left = ranks(a)
        right = ranks(b)
        # List comparison is lexicographic and puts a proper prefix first,
        # which is the required ordering; reduce it to -1, 0 or 1.
        return (left > right) - (left < right)

    return cmp_to_key(sort_dict_alnum_vulcan)
139
175
140
def translate(phrase):
    """Return the dictionary entry for *phrase*, or None if there is none.

    The lookup uses the lower-cased phrase as the key into the module-level
    ``dictionary``.
    """
    # NOTE(review): keys are stored as read from the dictionary file, so an
    # entry whose key contains upper-case letters cannot match -- confirm
    # that the file's keys are all lower case.
    return dictionary.get(phrase.lower())
-
 
162
182
163
def translate_words(words, lookup, log):
    """Replace the longest translatable runs of *words* in place.

    Starting at each position, the longest run of words (joined by single
    spaces) for which ``lookup(phrase)`` is not None is replaced by that one
    result; words without a translation are left unchanged.

    words  -- list of str; modified in place and also returned
    lookup -- callable(phrase) returning a translation or None
    log    -- callable with dmsg()'s signature, used for debug output
    """
    offset = 0
    while offset < len(words):
        translation = None

        # Longest candidate first: words[offset:], then ever shorter runs.
        for end in range(len(words), offset, -1):
            phrase = ' '.join(words[offset:end])

            log("phrase:", phrase, min_level=2)

            translation = lookup(phrase)

            if translation is not None:
                log("phrase-translation:", translation, min_level=2)
                log("words[{0}:{1}] = [\"{2}\"]".format(offset, end, translation), min_level=2)
                words[offset:end] = [translation]
                break

        if translation is None:
            log("phrase-translation:", translation, min_level=2)

        # Exactly one finished item now sits at ``offset`` (an untranslated
        # word or a collapsed translation).  Advancing by ``end - 1`` either
        # stayed on the translation or skipped untranslated words.
        offset += 1

    return words


if __name__ == '__main__':
    if len(argv) < 2:
        print('Nothing to translate.', end='\n\n', file=stderr)
        cli_help()
        # SystemExit needs no import; the exit() helper is installed by the
        # site module and is not guaranteed to exist.
        raise SystemExit(1)

    text = argv[1]

    load_dictionary(dictionary, 'vuh-gol-en.dict.zdb.txt')
    clean_dictionary(dictionary)

#     try:
#         for phrase, data in OrderedDict(sorted(
#             dictionary.items(),
#             key=get_sort_dict_alnum_vulcan_key()
#         )).items():
#             print(phrase, "=", data)
#     except BrokenPipeError:
#         pass

    # A sentence ends with one to three dots, or at the end of the text.
    sentences = findall(r'(?!\s+)(?:.+?\.{1,3}|.+$)', text, DOTALL)
    dmsg("sentences:", sentences, min_level=2)
    for sentence in sentences:
        dmsg("sentence:", sentence, min_level=2)

        # Clauses are separated by a dash surrounded by whitespace.
        clauses = split(r'\s+[-–—]\s+', sentence)
        dmsg("clauses:", clauses, min_level=2)
        for clause in clauses:
            dmsg("clause:", clause, min_level=2)

            words = findall(r'[^\s.]+', clause)
            dmsg("words:", words, min_level=2)

            translate_words(words, translate, dmsg)

            dmsg("words-translation:", words, min_level=2)
196
 
239