gdb/contrib/words.sh

   1 #!/bin/sh
   2
   3 # Copyright (C) 2019-2023 Free Software Foundation, Inc.
   4 # This program is free software; you can redistribute it and/or modify
   5 # it under the terms of the GNU General Public License as published by
   6 # the Free Software Foundation; either version 3 of the License, or
   7 # (at your option) any later version.
   8 #
   9 # This program is distributed in the hope that it will be useful,
  10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12 # GNU General Public License for more details.
  13 #
  14 # You should have received a copy of the GNU General Public License
  15 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
  16
  17 # This script intends to facilitate spell checking of source/doc files.
  18 # It:
  19 # - transforms the files into a list of lowercase words
  20 # - prefixes each word with the frequency
# - keeps only the words whose frequency lies within a given range
  22 # - sorts the words, longest first
  23 #
  24 # If '-c' is passed as option, it operates on the C comments only, rather than
  25 # on the entire file.
  26 #
  27 # For:
  28 # ...
# $ files=$(find gdb -type f \( -name "*.c" -o -name "*.h" \))
  30 # $ ./gdb/contrib/words.sh -c $files
  31 # ...
  32 # it generates a list of ~15000 words prefixed with frequency.
  33 #
  34 # This could be used to generate a dictionary that is kept as part of the
  35 # sources, against which new code can be checked, generating a warning or
  36 # error.  The hope is that misspellings would trigger this frequently, and rare
  37 # words rarely, otherwise the burden of updating the dictionary would be too
  38 # much.
  39 #
  40 # And for:
  41 # ...
# $ files=$(find gdb -type f \( -name "*.c" -o -name "*.h" \))
  43 # $ ./gdb/contrib/words.sh -c -f 1 $files
  44 # ...
  45 # it generates a list of ~5000 words with frequency 1.
  46 #
  47 # This can be used to scan for misspellings manually.
  48 #
  49
  50 minfreq=
  51 maxfreq=
  52 c=false
  53 while [ $# -gt 0 ]; do
  54     case "$1" in
  55         -c)
  56             c=true
  57             shift
  58             ;;
  59         --freq|-f)
  60             minfreq=$2
  61             maxfreq=$2
  62             shift 2
  63             ;;
  64         --min)
  65             minfreq=$2
  66             if [ "$maxfreq" = "" ]; then
  67                 maxfreq=0
  68             fi
  69             shift 2
  70             ;;
  71         --max)
  72             maxfreq=$2
  73             if [ "$minfreq" = "" ]; then
  74                 minfreq=0
  75             fi
  76             shift 2
  77             ;;
  78         *)
  79             break;
  80             ;;
  81     esac
  82 done
  83
  84 if [ "$minfreq" = "" ] && [ "$maxfreq" = "" ]; then
  85     minfreq=0
  86     maxfreq=0
  87 fi
  88
  89 awkfile=$(mktemp)
  90 trap 'rm -f "$awkfile"' EXIT
  91
  92 cat > "$awkfile" <<EOF
  93 BEGIN {
  94     in_comment=0
  95 }
  96
  97 // {
  98     line=\$0
  99 }
 100
 101 /\/\*/ {
 102     in_comment=1
 103     sub(/.*\/\*/, "", line)
 104 }
 105
 106 /\*\// {
 107     sub(/\*\/.*/, "", line)
 108     in_comment=0
 109     print line
 110     next
 111 }
 112
 113 // {
 114     if (in_comment) {
 115         print line
 116     }
 117 }
 118 EOF
 119
 120 # Stabilize sort.
 121 export LC_ALL=C
 122
 123 if $c; then
 124     awk \
 125         -f "$awkfile" \
 126         -- "$@"
 127 else
 128     cat "$@"
 129 fi \
 130     | sed \
 131           -e 's/[!"?;:%^$~#{}`&=@,. \t\/_()|<>\+\*-]/\n/g' \
 132           -e 's/\[/\n/g' \
 133           -e 's/\]/\n/g' \
 134           -e "s/'/\n/g" \
 135           -e 's/[0-9][0-9]*/\n/g' \
 136           -e 's/[ \t]*//g' \
 137     | tr '[:upper:]' '[:lower:]' \
 138     | sort \
 139     | uniq -c \
 140     | awk "{ if (($minfreq == 0 || $minfreq <= \$1) \
 141                  && ($maxfreq == 0 || \$1 <= $maxfreq)) { print \$0; } }" \
 142     | awk '{ print length($0) " " $0; }' \
 143     | sort -n -r \
 144     | cut -d ' ' -f 2-