Subversion Repositories LCARS

Rev

Rev 291 | Rev 293 | Go to most recent revision | Show entire file | Regard whitespace | Details | Blame | Last modification | View Log | RSS feed

Rev 291 Rev 292
Line -... Line 1...
-
 
1
#!/usr/bin/env python3
-
 
2
1
'''
3
'''
2
Created on 2014-10-20
4
Created on 2014-10-20
3

5

4
@author: Thomas 'PointedEars' Lahn <mail@PointedEars.de>
6
@author: Thomas 'PointedEars' Lahn <mail@PointedEars.de>
5
'''
7
'''
6
from sys import argv, stderr
8
from sys import argv, stderr
7
from re import findall, DOTALL, IGNORECASE, match, sub, compile, \
9
from re import findall, DOTALL, IGNORECASE, match, sub, compile, \
8
    split
10
    split
9
from os import chdir
11
from os import chdir, stat
10
from os.path import dirname, realpath
12
from os.path import dirname, realpath, basename
11
from collections import OrderedDict
13
from collections import OrderedDict
12
from functools import cmp_to_key
14
from functools import cmp_to_key
13
from copy import deepcopy
15
from copy import deepcopy
14
from collections.abc import MutableSequence
16
from collections.abc import MutableSequence
-
 
17
from pickle import dump, load
15
18
-
 
19
debug_level = 2
16
dictionary = {}
20
dictionary = {}
17
21
18
prepositions = {
22
prepositions = {
19
    "fi'": 'on',
23
    "fi'": 'on',
20
    "na'": 'at|to',
24
    "na'": 'at|to',
21
    "t'": 'of'
25
    "t'": 'of'
22
}
26
}
23
27
-
 
28
def dmsg(*args, **kwargs):
    '''
    Print a debug message if the module-level debug_level is high enough.

    All positional and keyword arguments are passed on to print(),
    except for the following keyword arguments, which are handled here:

    min_level -- minimum value that debug_level must have for the
                 message to be printed; defaults to 1 when missing
                 or None.  It is never passed on to print().
    file      -- stream to print to; defaults to stderr when missing.
    '''
    # kwargs is a dict, so its entries are keys, not attributes:
    # hasattr(kwargs, 'min_level') is always False and would overwrite
    # whatever level (and file) the caller passed.
    min_level = kwargs.pop('min_level', None)
    if min_level is None:
        min_level = 1

    kwargs.setdefault('file', stderr)

    if debug_level >= min_level:
        print(*args, **kwargs)
-
 
38
24
class MutableString2(MutableSequence):
39
class MutableString2(MutableSequence):
25
    def __init__(self, value=None):
40
    def __init__(self, value=None):
26
        self._values = [str(value)] if value is not None else []
41
        self._values = [str(value)] if value is not None else []
27
42
28
    def __add__(self, value):
43
    def __add__(self, value):
Line 50... Line 65...
50
        self._values.append(values)
65
        self._values.append(values)
51
66
52
    def insert(self, index, value):
67
    def insert(self, index, value):
53
        raise NotImplementedError
68
        raise NotImplementedError
54
69
-
 
70
def cli_help():
    '''
    Print a one-line usage summary to standard output.

    The program name shown is the last path component of argv[0].
    '''
    program_name = basename(argv[0])
    usage = 'Usage: {0} TEXT...'.format(program_name)
    print(usage)
-
 
72
55
def load_dictionary(dictionary, dictionary_file):
73
def load_dictionary(dictionary, dictionary_file):
56
    print('Loading dictionary {0} ...'.format(dictionary_file), end='', file=stderr)
74
    dmsg('Loading dictionary '.format(dictionary_file), end='', min_level=1)
57
75
58
    chdir(dirname(realpath(__file__)))
76
    chdir(dirname(realpath(__file__)))
-
 
77
-
 
78
    pickle_file = basename(dictionary_file) + '.pickle'
-
 
79
-
 
80
    try:
-
 
81
        pickle_mtime = stat(pickle_file).st_mtime
-
 
82
    except FileNotFoundError:
-
 
83
        pickle_mtime = None
-
 
84
-
 
85
    if pickle_mtime is None or stat(dictionary_file).st_mtime > pickle_mtime:
-
 
86
        dmsg('from {0} ...'.format(dictionary_file), end='', min_level=1)
59
    with open(dictionary_file) as f:
87
        with open(dictionary_file) as f:
60
        keys = "ipa|en|lit|pos|com|tag|ex"
88
            keys = "ipa|en|lit|pos|com|tag|ex"
61
        indent = None
89
            indent = None
62
        value = None
90
            value = None
63
91
Line 66... Line 94...
66
            if m is not None:
94
                if m is not None:
67
                phrase = m.group("phrase")
95
                    phrase = m.group("phrase")
68
                dictionary[phrase] = {}
96
                    dictionary[phrase] = {}
69
                indent = None
97
                    indent = None
70
            else:
98
                else:
71
                m = match(
-
 
72
                    r'(?P<indent>\s*)(?P<key>{0}):\s*(?P<value>.+)'.format(keys),
99
                    m = match(r'(?P<indent>\s*)(?P<key>{0}):\s*(?P<value>.+)'.format(keys), line)
73
                    line)
-
 
74
                if m is not None:
100
                    if m is not None:
75
                    indent = m.group("indent")
101
                        indent = m.group("indent")
76
                    key = m.group("key")
102
                        key = m.group("key")
77
                    value = m.group("value")
103
                        value = m.group("value")
78
                    value = dictionary[phrase][key] = MutableString2(value)
104
                        value = dictionary[phrase][key] = MutableString2(value)
Line 80... Line 106...
80
                    m = match(r'(?P<indent>\s+)(?P<continuation>\S.*)', line)
106
                        m = match(r'(?P<indent>\s+)(?P<continuation>\S.*)', line)
81
                    if m is not None:
107
                        if m is not None:
82
                        if len(m.group("indent")) == len(indent) + 2:
108
                            if len(m.group("indent")) == len(indent) + 2:
83
                            dictionary[phrase][key] += (" " + m.group("continuation"))
109
                                dictionary[phrase][key] += (" " + m.group("continuation"))
84
110
-
 
111
        dmsg('\nSaving pickle {0} ...'.format(pickle_file), end='', min_level=1)
-
 
112
        # TODO: Pickle should only contain strings to be small
-
 
113
        with open(pickle_file, mode='wb') as f: dump(dictionary, f)
-
 
114
        dmsg(' done.', min_level=1)
-
 
115
    else:
-
 
116
        dmsg('from {0} ...'.format(pickle_file), end='', min_level=1)
-
 
117
        with open(pickle_file, mode='rb') as f: pickle = load(f)
-
 
118
        for key, value in pickle.items():
-
 
119
            dictionary[key] = value
-
 
120
85
    print(' done ({0} entries).'.format(len(dictionary)), file=stderr)
121
    dmsg(' done ({0} entries).'.format(len(dictionary)), min_level=1)
86
122
87
def clean_dictionary(dictionary):
123
def clean_dictionary(dictionary):
88
    braces_re = compile(r'^\s*\{(.+)\}\s*$')
124
    braces_re = compile(r'^\s*\{(.+)\}\s*$')
89
    semicolon_re = compile(r'\s*;\s*')
125
    semicolon_re = compile(r'\s*;\s*')
90
126
Line 135... Line 171...
135
171
136
        return 1 if len(b) < len(a) else 0
172
        return 1 if len(b) < len(a) else 0
137
173
138
    return cmp_to_key(sort_dict_alnum_vulcan)
174
    return cmp_to_key(sort_dict_alnum_vulcan)
139
175
140
def translate (word, recursion=False):
176
def translate (phrase):
141
    translation = dictionary.get(word.lower(), None)
177
    translation = dictionary.get(phrase.lower(), None)
142
    if translation is not None:
178
    if translation is not None:
143
        translation = translation["en"]
-
 
144
        if match('[A-Z]', word):
-
 
145
            return sub('[a-z]', lambda ch: ch.group(0).upper(), str(translation), count=1)
-
 
146
        return translation
179
        return translation
147
180
148
    if not recursion:
-
 
149
        # prepositions attached?
-
 
150
        for prep, prep_transl in prepositions.items():
-
 
151
            if (match(prep, word)):
-
 
152
                real_word = word.replace(r'^' + prep, '')
-
 
153
                real_word_transl = translate(real_word, recursion=True)
-
 
154
                if real_word_transl is not None:
-
 
155
                    return prep_transl + ' ' + real_word_transl
-
 
156
-
 
157
    if recursion:
-
 
158
        return None
181
    return None
159
    else:
-
 
160
        # Not in dictionary: proper name or missing for other reasons
-
 
161
        return '{{{0}}}'.format(word)
-
 
162
182
163
if __name__ == '__main__':
183
if __name__ == '__main__':
-
 
184
    if len(argv) < 2:
-
 
185
        print('Nothing to translate.', end='\n\n', file=stderr)
-
 
186
        cli_help()
-
 
187
        exit(1)
-
 
188
-
 
189
    text = argv[1]
-
 
190
164
    load_dictionary(dictionary, 'vuh-gol-en.dict.zdb.txt')
191
    load_dictionary(dictionary, 'vuh-gol-en.dict.zdb.txt')
165
    clean_dictionary(dictionary)
192
    clean_dictionary(dictionary)
166
193
167
#     try:
194
#     try:
168
#         for phrase, data in OrderedDict(sorted(
195
#         for phrase, data in OrderedDict(sorted(
Line 171... Line 198...
171
#         )).items():
198
#         )).items():
172
#             print(phrase, "=", data)
199
#             print(phrase, "=", data)
173
#     except BrokenPipeError:
200
#     except BrokenPipeError:
174
#         pass
201
#         pass
175
202
176
    text = argv[1]
-
 
177
    sentences = findall(r'(?!\s+)(?:.+?\.{1,3}|.+$)', text, DOTALL)
203
    sentences = findall(r'(?!\s+)(?:.+?\.{1,3}|.+$)', text, DOTALL)
-
 
204
    dmsg("sentences:", sentences, min_level=2)
178
    for sentence in sentences:
205
    for sentence in sentences:
179
        print(sentence)
206
        dmsg("sentence:", sentence, min_level=2)
180
207
181
        words = findall(r"(?!\s+)[a-z'-]{2,}", sentence, IGNORECASE)
208
        clauses = split(r'\s+[-–—]\s+', sentence)
-
 
209
        dmsg("clauses:", clauses, min_level=2)
182
        print(words)
210
        for clause in clauses:
-
 
211
            dmsg("clause:", clause, min_level=2)
183
212
184
        translated_words = list(map(translate, words))
213
            words = findall(r'[^\s.]+', clause)
185
        print(translated_words)
214
            dmsg("words:", words, min_level=2)
186
215
187
        for index, word in enumerate(words):
216
            offset = 0
188
            sentence = sentence.replace(word, str(translated_words[index]))
217
            while offset < len(words):
189
        print(sentence)
218
                translation = None
190
219
-
 
220
                for i in reversed(range(offset + 1, len(words) + 1)):
-
 
221
                    phrase = ' '.join(words[offset:i])
-
 
222
-
 
223
                    dmsg("phrase:", phrase, min_level=2)
-
 
224
-
 
225
                    translation = translate(phrase)
-
 
226
-
 
227
                    if translation is not None:
-
 
228
                        dmsg("phrase-translation:", translation, min_level=2)
-
 
229
                        dmsg("words[{0}:{1}] = [\"{2}\"]".format(offset, i, translation), min_level=2)
-
 
230
                        words[offset:i] = [translation]
-
 
231
                        offset += i - 1
191
        # replace punctuation
232
                        break
-
 
233
192
        for symbol, replacement in ({" - ": ", "}).items():
234
                if translation is None:
193
            sentence = sentence.replace(symbol, replacement)
235
                    dmsg("phrase-translation:", translation, min_level=2)
-
 
236
                    offset += 1
194
237
195
        print(sentence)
238
            dmsg("words-translation:", words, min_level=2)