By default, do not convert primes after digits.
[lingdata-utils.git] / conv-palataliz
blobd0571f2d525fe1b426b280caa6bc59d6aede5053
1 #!/bin/bash
2 # -*- mode: sh; coding: utf-8 -*-
3 # $Date: 2008/03/01 03:57:48 $
5 # Do the matching for the Latin alphabet (English), but in the UTF-8 encoding (for wide characters).
6 LC_ALL=POSIX.UTF8 sed -e "
7 ##########################################
8 # Well-formedness of the input:
9 # - no palatalization signs on vowels;
10 # - no ' on non-Latin chars
11 # (though ˣ is allowed; brackets and digits are also allowed).
12 /[auiyeo]'\|[^][)(}{[:digit:][:lower:]ˣ]'/ { w /dev/stderr
13 q 57;} # (mostly GNU sed extensions)
15 # BTW, digits with primes shouldn't be converted (unless there is a special desire).
16 ##########################################
17 # Actual code:
18 s,\(^\|[^[:digit:]]\)',\1ʹ,g # Converted to U+02B9 MODIFIER LETTER PRIME (transliteration of mjagkij znak (Cyrillic soft sign: palatalization))
19 # FIXME: perhaps apply only to Latin+apostrophe; apostrophes can be used elsewhere? (Or one should use the correct quoting character there?)
22 exitCode="$?"
23 if [[ "$exitCode" == 57 ]]; then
24 echo $"-- error: non well-formed input (' after a vowel or another disallowed char)!" >/dev/stderr
26 exit "$exitCode"