#!/usr/bin/python3
import argparse
import Levenshtein
import os
import regex as re
import shutil
import subprocess
import sys

parser = argparse.ArgumentParser(prog='kal-spell-heur', description='Attempts speller and other heuristic analysis for missing analyses')
parser.add_argument('-s', '--speller-bin', action='store', help='Path to hfst-ospell-office; defaults to searching $PATH', default='')
parser.add_argument('-S', '--speller-fst', action='store', help='Path to kl.zhfst; defaults to searching install paths', default='')
parser.add_argument('-t', '--tokenise-bin', action='store', help='Path to hfst-tokenise; defaults to searching $PATH', default='')
parser.add_argument('-T', '--tokenise-fst', action='store', help='Path to tokeniser-disamb-gt-desc.pmhfst; defaults to searching install paths', default='')
parser.add_argument('-D', '--debug', action='store_true', help='Enable Python stack traces and other debugging', default=False)
cmdargs = parser.parse_args()

if not cmdargs.debug:
    sys.tracebacklimit = 0

g_prefix = '/usr'
g_dir = os.path.dirname(os.path.abspath(__file__))

# hfst-ospell-office
bins = [cmdargs.speller_bin, shutil.which('hfst-ospell-office'), '/usr/local/bin/hfst-ospell-office', '/opt/local/bin/hfst-ospell-office', '/usr/bin/hfst-ospell-office']
bin_spell = ''
for f in bins:
    if f and os.access(f, os.X_OK):
        bin_spell = f
        break
if not bin_spell:
    raise RuntimeError('No usable hfst-ospell-office given or found!')
if cmdargs.debug:
    print(f"kal-spell-heur: Using {bin_spell}", file=sys.stderr, flush=True)

# hfst-tokenise
bins = [cmdargs.tokenise_bin, shutil.which('hfst-tokenise'), '/usr/local/bin/hfst-tokenise', '/opt/local/bin/hfst-tokenise', '/usr/bin/hfst-tokenise']
bin_tok = ''
for f in bins:
    if f and os.access(f, os.X_OK):
        bin_tok = f
        break
if not bin_tok:
    raise RuntimeError('No usable hfst-tokenise given or found!')
if cmdargs.debug:
    print(f"kal-spell-heur: Using {bin_tok}", file=sys.stderr, flush=True)

# kl.zhfst
fsts = [
    cmdargs.speller_fst,
    '${prefix}/share/voikko/3/kl.zhfst'.replace('${prefix}', g_prefix),
    '/usr/local/share/voikko/3/kl.zhfst',
    '/usr/share/voikko/3/kl.zhfst',
    f'{g_dir}/../spellcheckers/3/kl.zhfst',
]
fst_spell = ''
for f in fsts:
    if f and os.path.exists(f) and os.path.getsize(f):
        fst_spell = f
        break
if not fst_spell:
    raise RuntimeError('No usable kl.zhfst given or found!')
if cmdargs.debug:
    print(f"kal-spell-heur: Using {fst_spell}", file=sys.stderr, flush=True)

# tokeniser-disamb-gt-desc.pmhfst
fsts = [
    cmdargs.tokenise_fst,
    '${prefix}/share/giella/kal/tokeniser-disamb-gt-desc.pmhfst'.replace('${prefix}', g_prefix),
    '/usr/local/share/giella/kal/tokeniser-disamb-gt-desc.pmhfst',
    '/usr/share/giella/kal/tokeniser-disamb-gt-desc.pmhfst',
    f'{g_dir}/../tokenisers/tokeniser-disamb-gt-desc.pmhfst',
]
fst_tok = ''
for f in fsts:
    if f and os.path.exists(f) and os.path.getsize(f):
        fst_tok = f
        break
if not fst_tok:
    raise RuntimeError('No usable tokeniser.pmhfst given or found!')
if cmdargs.debug:
    print(f"kal-spell-heur: Using {fst_tok}", file=sys.stderr, flush=True)

spell = subprocess.Popen([bin_spell, '-T', '-d', fst_spell], bufsize=1, stdin=subprocess.PIPE, stdout=subprocess.PIPE, encoding='UTF-8', text=True)
tok = subprocess.Popen([bin_tok, '-L', fst_tok], bufsize=1, stdin=subprocess.PIPE, stdout=subprocess.PIPE, encoding='UTF-8', text=True)

# Wait for the speller's greeting so queries are not sent too early
while g := spell.stdout.readline():
    g = g.strip()
    if g == '@@ hfst-ospell-office is alive':
        break
    if g.startswith('@@ '):
        continue
    raise RuntimeError(f'Speller gave unexpected greeting: {g}')

cohorts = []
cohort = { 'p': '', 'w': '', 'rs': [], 's': '', }
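# Each cohort mirrors one unit of the Constraint Grammar stream this script
# reads and writes:
#   'p'  - text preceding the wordform line (superblanks, markup)
#   'w'  - the wordform line, e.g. '"<illu>"'
#   'rs' - the reading lines, e.g. '\t"illu" N ...'
#   's'  - trailing text such as blank lines
# (The wordform and tags in these examples are illustrative only.)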
old_words = {
    'b': set(['baaja', 'baalia', 'baaliar', 'bajeri', 'biibili', 'biili', 'biiler', 'bussi']),
    'd': set(['diaavulu', 'decembari']),
    'f': set(['farisiiari', 'februaari', 'feeria', 'feeriar', 'freer']),
    'g': set(['gassi', 'guuti']),
    'h': set(['hiisti', 'horaa', 'horaartor', 'huaa', 'huaartor']),
    'j': set(['januaari', 'joorli', 'joorlisior', 'jorngoq', 'juuli', 'juulli', 'juumooq', 'juuni', 'juuti']),
    'l': set(['laaja', 'lakker', 'lakki', "lal'laaq", 'lappi', 'liimmer', 'liimmi']),
    'r': set(['raaja', 'rinngi', 'rommi', 'russeq', 'ruua', 'ruujori', 'ruusa', 'ruusaar']),
    'v': set(['viinnequt', 'viinni']),
}

spell_cache = {}
tok_cache = {}

def do_spell(token):
    # Send the token to the speller and parse any '&' suggestion line;
    # returns a dict of {suggestion: weight}
    global spell_cache
    if token in spell_cache:
        return spell_cache[token]
    spell.stdin.write(f'5 {token}\n')
    out = spell.stdout.readline()
    rv = {}
    if out.startswith('&'):
        sgs = out[2:].strip().split('\t')
        for sg in sgs:
            m = re.match(r'^(.+?) \(([\d.]+);\d+\)$', sg.strip())
            if m:
                rv[m[1]] = m[2]
    if len(spell_cache) >= 20000:
        spell_cache.clear()
    spell_cache[token] = rv
    return rv

def do_tokenise(token):
    # Run a single token through hfst-tokenise, collecting the wordform
    # line and its readings; an unknown token ('" ?') yields empty results
    global tok_cache
    if token in tok_cache:
        return tok_cache[token]
    tok.stdin.write(f'{token}\n\n')
    rv = { 'w': '', 'rs': [], }
    while (l := tok.stdout.readline()) != '\n':
        if l.startswith('"<'):
            rv['w'] = l
        elif l.startswith('\t"'):
            if '" ?' in l:
                rv = { 'w': '', 'rs': [], }
                break
            rv['rs'].append(l)
    if len(tok_cache) >= 20000:
        tok_cache.clear()
    tok_cache[token] = rv
    return rv

def kal_from(token):
    # Heuristic estimate of how much of the token's start does not look
    # like Kalaallisut; 0 means it could be native or is a known loanword
    token = token.lower()
    rv = 0
    first = token[0]
    if first in old_words:
        if token in old_words[first]:
            return 0
        rv = 1
    if not re.match(r'[aeikmnopqstu]', first):
        rv = max(rv, 1)
    if re.match(r'^.[bcdwxyzæøå]', token):
        rv = max(rv, 2)
    # ToDo: 'ai' anywhere but the end signifies non-kal
    if re.match(r'^.ai', token):
        rv = max(rv, 3)
    eorq = re.finditer(r'[eo]+[^eorq]', token)
    for m in eorq:
        rv = max(rv, m.start()+2)
    if not re.match(r'[aikpqtu]', token[-1]):
        rv = max(rv, len(token))
    if m := re.match(r'[^aefgijklmnopqrstuvŋ][aefgijklmnopqrstuvŋ]+$', token):
        rv = max(rv, m.start()+1)
    cons = re.finditer(r'([qwrtpsdfghjklzxcvbnmŋ])([qwrtpsdfghjklzxcvbnmŋ])', token)
    for m in cons:
        if m[1] == 'r':
            continue
        if m[1] == 'n' and m[2] == 'g':
            continue
        if m[1] == 't' and m[2] == 's':
            continue
        if m[1] != m[2]:
            rv = max(rv, m.start()+2)
    return rv

def kal_segment(token):
    # Split an alphabetic token into rough syllables, treating 'ng'/'nng'
    # as single consonants via the placeholder ŋ
    if not re.match(r'^[a-zæøåŋ]+$', token, flags=re.I):
        return [token]
    token = re.sub(r'nng', 'ŋŋ', token)
    token = re.sub(r'ng', 'ŋ', token)
    C = re.compile(r'[bcdfghjklmnŋpqrstvwxz]', flags=re.I)
    V = re.compile(r'[aeiouyæøå]', flags=re.I)
    i = 0
    split = ''
    while True:
        split += token[i]
        if len(token) >= i+3 and V.match(token[i]) and C.match(token[i+1]) and V.match(token[i+2]):
            # V.CV boundary
            split += ' '
        elif len(token) >= i+4 and V.match(token[i]) and C.match(token[i+1]) and C.match(token[i+2]) and V.match(token[i+3]):
            # VC.CV boundary
            i += 1
            split += token[i]
            split += ' '
        elif len(token) >= i+2 and token[i].lower() != token[i+1].lower() and V.match(token[i]) and V.match(token[i+1]):
            # boundary between unlike vowels
            split += ' '
        i += 1
        if i >= len(token)-2:
            break
    split += token[i:]
    return split.replace('ŋ ŋ', 'n ng').replace('ŋŋ', 'nng').replace('ŋ', 'ng').split(' ')

def print_cohort(c):
    if c['p']:
        print(c['p'], end='')
    if c['w']:
        print(c['w'], end='')
    for r in c['rs']:
        print(r, end='')
    if c['s']:
        print(c['s'], end='')

def process():
    global cohorts, cohort
    # Mark cohorts where analysis failed ('" ?' readings)
    errs = [False]*len(cohorts)
    for i,c in enumerate(cohorts):
        for r in c['rs']:
            if '" ?' in r:
                errs[i] = True
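    # For each unanalysed cohort, two fallbacks are tried: Heur/Spell borrows
    # the analyses of the speller's suggestions, and Heur/Xxx swaps a
    # foreign-looking prefix for a dummy stem (Xxx/xxx), keeping any
    # proper-noun readings of the remainder.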
    for i,c in enumerate(cohorts):
        if not errs[i]:
            continue
        w = c['w'][2:-3]
        rs = []
        sgs = do_spell(w)
        for k,v in sgs.items():
            # Edit distance to the suggestion (computed but not used below)
            d = Levenshtein.distance(w, k)
            nc = do_tokenise(k)
            for r in nc['rs']:
                r = r.rstrip()
                rs.append(f'{r} Heur/Spell \n')
        n = kal_from(w)
        nw = w[0:n]
        sgs = kal_segment(w[n:])
        for si in range(len(sgs)):
            rem = ''.join(sgs[si:])
            nc = None
            if w[0].isupper():
                nc = do_tokenise('Xxx' + rem)
            else:
                nc = do_tokenise('xxx' + rem)
            if nc['rs']:
                ncw = nc['w'][2:-3]
                for r in nc['rs']:
                    if ' Prop' not in r and ' iProp' not in r:
                        continue
                    r = re.sub(r'\t"Xxx', f'\t"{nw}', r.rstrip(), flags=re.I)
                    d = len(w) - len(rem)
                    rs.append(f'{r} Heur/Xxx \n')
                if rs:
                    break
            nw += sgs[si]
        if rs:
            c['rs'] = rs
    for i,c in enumerate(cohorts):
        print_cohort(c)
    cohorts = []
    cohort = { 'p': '', 'w': '', 'rs': [], 's': '', }

for line in sys.stdin:
    #line = line.rstrip()
    if line.startswith('"<'):
        if cohort['w']:
            cohorts.append(cohort)
            if cohorts[-1]['w'].rstrip() == '"<¶>"':
                process()
            cohort = { 'p': '', 'w': '', 'rs': [], 's': '', }
        cohort['w'] = line
    elif line.startswith('\t"'):
        if cohort['w']:
            cohort['rs'].append(line)
        else:
            cohort['p'] += line
    elif line.startswith('<STREAMCMD:FLUSH>'):
        # In-stream flush command: process everything buffered so far
        cohorts.append(cohort)
        process()
        cohort['p'] = line
    else:
        if cohort['w']:
            cohort['s'] += line
        else:
            cohort['p'] += line

cohorts.append(cohort)
process()
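# Typical use is as a filter in a CG pipeline; the surrounding commands below
# are a hypothetical sketch, not a documented invocation:
#   hfst-tokenise -g tokeniser-disamb-gt-desc.pmhfst corpus.txt \
#       | ./kal-spell-heur.py | vislcg3 -g disambiguator.cg3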