#!/usr/bin/perl # -*- mode: cperl; indent-tabs-mode: nil; tab-width: 3; cperl-indent-level: 3; -*- use strict; use warnings; use utf8; BEGIN { $| = 1; binmode(STDIN, ':encoding(UTF-8)'); binmode(STDOUT, ':encoding(UTF-8)'); } use open qw( :encoding(UTF-8) :std ); use feature 'unicode_strings'; # Prefixes from root.lexc ca. lines 14-15 my @p = qw(AA TA); # Main POS tags from root.lexc ca. lines 19-29 my @m = qw(N V Pali Conj Adv Interj Pron Prop Num Symbol); # List of tags that should block Gram/TV and Gram/IV propagation my @v_block = qw( NIQAR HTR NAR GUMINAR SIMA TARIAQAR UTE SAAR SAR TIP NIRAR SURE SUGE GASUGE GASURE NASUGE NASURE TSAALI TIR TITIR QQU ); my $pp = join('|', @p); my $mp = join('|', @m); my $vb = join('|', @v_block); sub propagate_tags { my ($c) = @_; if ($c !~ m@^\s+"@ || $c !~ m@Gram/[TI]V@) { return $c; } # Snip duplicate tags my $o = $c; do { $o = $c; $c =~ s@ (Gram/[HTI]V) \1@ $1@g; $c =~ s@ (Gram/[HTI]V) (Der/[nv][nv]) \1@ $2 $1@g; } while ($o ne $c); my @ts = split(/ (?=(?:(?:i?(?:$mp))|(?:\p{Lu}[_\p{Lu}]+)|U)(?: |$))/, $c); my $gram = ''; foreach (@ts) { # Stop entirely at hybrid marker, to avoid mangling 2nd part of a hybrid if (m@ Gram/Hyb@) { last; } # Don't propagate past POS, blocking morphemes, Pass/Refl, or noun morphemes if (m/^(?:$mp)(?: |$)/ || m/^(?:$vb)(?: |$)/ || m@ Gram/(Pass|Refl)@ || m@ Der/[nv]n@) { $gram = ''; next; } if (m@ (Gram/[TI]V)@) { $gram = $1; } elsif ($gram && m@ Der/[nv]v@) { $_ .= " $gram"; } } return join(' ', @ts); } while () { chomp; # Move prefixes after the base form $_ =~ s@^(\s+)($pp) (".+?" )@$1$3Prefix/$2 @mg; $_ = propagate_tags($_); print "$_\n"; }