conv-palataliz

   1 #!/bin/bash
   2 # -*- mode: sh; coding: utf-8 -*-
   3 # $Date: 2008/03/01 03:57:48 $
   4
   5 # Do the matching for Latin alphabet (English), but in the UTF-8 encoding (for wide characters).
   6 LC_ALL=POSIX.UTF8 sed -e "
   7 ##########################################
   8 # Well-formedness of the input:
   9 # - no palatalization signs on vowels;
  10 # - no ' on non-Latin chars
  11 #   (though, ˣ is allowed; brackets and digits also are allowed).
  12 /[auiyeo]'\|[^][)(}{[:digit:][:lower:]ˣ]'/ { w /dev/stderr
  13 q 57;} # (mostly GNU sed extensions)
  14 #
  15 # BTW, digits with primes shouldn't be converted (unless there is a special desire).
  16 ##########################################
  17 # Actual code:
  18 s,\(^\|[^[:digit:]]\)',\1ʹ,g # Converted to U+02B9 MODIFIER LETTER PRIME (transliteration of mjagkij znak (Cyrillic soft sign: palatalization))
  19 # FIXME: perhaps apply only to Latin+apostroph; apostrophs can be used elsewhere? (Or one should use the correct quoting character there?)
  20 "
  21
  22 exitCode="$?"
  23 if [[ "$exitCode" == 57 ]]; then
  24     echo $"-- error: non well-formed input (' after a vowel or another disallowed char)!" >/dev/stderr
  25 fi
  26 exit "$exitCode"
  27