#!/usr/bin/python3
"""GiellaLT tests for multichars in lexc."""

import re
import sys
from argparse import ArgumentParser
from time import time

# Shapes of symbols that look like they should be declared as multichars.
# Compiled once here because they are applied to every lexicon entry.
PLUS_SUFFIX_TAG_RE = re.compile(r"\+[A-Za-z0-9_-][^+@#]*")
PREFIX_PLUS_TAG_RE = re.compile(r"[^+@#]*[A-Za-z0-9_-]\+")
AT_FLAG_AT_RE = re.compile(r"@[^@]+@")


def _strip_comment(line):
    """Return line cut at its first unescaped ``!``; ``%!`` is preserved."""
    if "!" not in line:
        return line
    line = line.replace("%!", "§EXCLAMATION§")
    line = line.split("!")[0]
    return line.replace("§EXCLAMATION§", "%!")


def _sanitize_entry(line):
    """Return a lexicon entry line reduced to the parts that are checked.

    Tabs become spaces, the ``!`` comment is dropped, ``"..."`` strings
    preceded by a space are removed and ``<...>`` spans are replaced by a
    ``§REGEX§`` marker. Escaped characters (``%"``, ``%<``, ``%>``) are
    masked while this happens and put back afterwards.
    """
    # The quoted-string pattern below expects a literal space before the
    # opening quote, so tabs are normalised first.
    line = _strip_comment(line.replace("\t", " "))
    if "\"" in line:
        line = line.replace("%\"", "§QUOTATION§")
        line = re.sub(" \"[^\"]*\"", "", line)
        # Put the escape back as %" (not a bare quote) so the symbol still
        # compares equal to its spelling in Multichar_Symbols.
        line = line.replace("§QUOTATION§", "%\"")
    if "<" in line:
        line = line.replace("%<", "§LESSTHAN§")
        line = line.replace("%>", "§MORETHAN§")
        line = re.sub("<[^>]*>", "§REGEX§", line)
        line = line.replace("§LESSTHAN§", "%<")
        line = line.replace("§MORETHAN§", "%>")
    elif ">" in line:
        line = line.replace("%>", "§MORETHAN§")
        if ">" in line:
            # A closing > without any opening < on the line.
            line = re.sub("^[^>]*>", "§REGEXFAIL", line)
        line = line.replace("§MORETHAN§", "%>")
    return line


def _report_undeclared(deep, line, lineno, declared,
                       plussuffixtags, prefixplustags):
    """Print undeclared multichar candidates in deep; return failure count.

    deep is the part of an entry before ``:``, line and lineno are only
    used in the messages, declared is the set of declared multichars and
    the two booleans tell whether any ``+Tag`` / ``Tag+`` style symbol was
    declared at all.
    """
    failures = 0
    sussuffix = []
    for tag in PLUS_SUFFIX_TAG_RE.findall(deep):
        if tag in declared:
            continue
        # The match may have picked up one trailing non-letter; accept the
        # tag when it is declared without that character.
        if not tag[-1].isalpha() and tag[:-1] in declared:
            continue
        sussuffix.append([tag])
    susprefix = [[tag] for tag in PREFIX_PLUS_TAG_RE.findall(deep)
                 if tag not in declared]
    if sussuffix and plussuffixtags:
        if not prefixplustags:
            print(f"{sussuffix} seem(s) like a multichar "
                  "suffix tag but is missing from the "
                  "Multichar_Symbols section "
                  f"on line {lineno}:\n{line}")
            failures += 1
        elif susprefix:
            print(f"{sussuffix} or {susprefix} seem like "
                  "potential multichars (suffixes or prefixes?)"
                  " but are missing from "
                  "Multichar_Symbols section "
                  f"on line {lineno}:\n{line}")
            failures += 1
    elif susprefix and prefixplustags and not plussuffixtags:
        # When both tag styles are declared, a prefix-only suspect is not
        # reported: it cannot be told apart from "Tag" followed by "+Tag".
        print(f"{susprefix} seem(s) like a multichar "
              "prefix tag but is missing from the "
              "Multichar_Symbols section "
              f"on line {lineno}:\n{line}")
        failures += 1
    for flag in AT_FLAG_AT_RE.findall(deep):
        if flag not in declared:
            print(f"{flag} seems like a multichar "
                  "flag diacritic but is missing from the "
                  "Multichar_Symbols section "
                  f"on line {lineno}:\n{line}")
            failures += 1
    return failures


def main(argv=None):
    """CLI for the GiellaLT lexc Multichar_Symbols test.

    Reads the lexc file given with ``-l``, collects the symbols declared
    under ``Multichar_Symbols`` and reports entries in the lexicons whose
    upper side contains ``+Tag``, ``Tag+`` or ``@flag@`` shaped strings
    that were not declared.

    Args:
        argv: argument list to parse; ``None`` (the default) makes argparse
            use ``sys.argv[1:]``.

    Exits with status 77 when the file has no lines and with status 1 when
    any problem was reported; returns ``None`` otherwise.
    """
    argp = ArgumentParser()
    argp.add_argument("-l", "--lexc", dest="lexcfile",
                      help="read multichars from lexc file", required=True)
    argp.add_argument("-d", "--debug", action="store_true", default=False,
                      help="prints debugging outputs")
    argp.add_argument("-v", "--verbose", action="store_true", default=False,
                      help="prints some outputs")
    options = argp.parse_args(argv)
    start = time()
    inmultichars = False
    inlexicons = False
    failcount = 0
    declaredmultichars = set()
    lines = 0
    plussuffixtags = False
    prefixplustags = False
    # Explicit UTF-8 instead of the locale default, and the handle is closed
    # when the loop is done.
    with open(options.lexcfile, encoding="utf-8") as lexcfile:
        for lines, line in enumerate(lexcfile, 1):
            if not inmultichars and not inlexicons:
                # Still before the Multichar_Symbols section.
                if line.startswith("Multichar_Symbols"):
                    inmultichars = True
                    rest = line[len("Multichar_Symbols") + 1:].strip()
                    if "!" in rest:
                        rest = rest.split("!")[0].strip()
                    if rest != "":
                        print("trailing rubbish after multichar syms: "
                              f"{rest}")
                        failcount += 1
                elif line.startswith("LEXICON"):
                    print("found lexicons before multichars: "
                          f"{line.strip()}")
                    failcount += 1
                    inlexicons = True
            elif inmultichars:
                if line.startswith("LEXICON "):
                    inmultichars = False
                    inlexicons = True
                    continue
                # Mask escaped spaces so split() keeps such symbols whole.
                line = _strip_comment(line).replace("% ", "§SPACE§")
                for multichar in line.split():
                    multichar = multichar.replace("§SPACE§", "% ")
                    declaredmultichars.add(multichar)
                    if multichar.startswith("+"):
                        plussuffixtags = True
                    if multichar.endswith("+"):
                        prefixplustags = True
            else:
                # Inside the lexicons; only lines with an entry terminator
                # are inspected.
                if ";" not in line:
                    continue
                line = _sanitize_entry(line)
                line = line.replace("%;", "§SEMICOLON§")
                if line.count(";") > 1:
                    print(f"too many semicolons on line {lines}:\n{line}")
                    failcount += 1
                    continue
                line = line.replace("§SEMICOLON§", "%;")
                line = line.replace("% ", "§SPACE§")
                pairstring = None
                fields = line.split()
                for i, field in enumerate(fields):
                    if field != ";":
                        continue
                    # Layout is "pairstring continuation ;", so the pair
                    # string sits two fields before the semicolon.
                    if i >= 2:
                        pairstring = fields[i - 2]
                    if i >= 3:
                        if line.startswith("LEXICON "):
                            print("entries on LEXICON line is not "
                                  f"supported:\n{line}")
                        else:
                            print(f"too many spaces? parsing:\n{line}")
                        failcount += 1
                if pairstring:
                    deep = pairstring.split(":")[0]
                    failcount += _report_undeclared(
                        deep, line, lines, declaredmultichars,
                        plussuffixtags, prefixplustags)
    end = time()
    print(f"Used {end - start} times")
    if lines == 0:
        print(f"SKIP: could not find multichars in {options.lexcfile}")
        sys.exit(77)
    if failcount > 0:
        print("FAIL: there were problems (see above).")
        sys.exit(1)


if __name__ == "__main__":
    main()