1 #!/bin/sh
2
3 #**************************************************************************
4 #* *
5 #* OCaml *
6 #* *
7 #* Damien Doligez, projet Gallium, INRIA Rocquencourt *
8 #* *
9 #* Copyright 2012 Institut National de Recherche en Informatique et *
10 #* en Automatique. *
11 #* *
12 #* All rights reserved. This file is distributed under the terms of *
13 #* the GNU Lesser General Public License version 2.1, with the *
14 #* special exception on linking described in the file LICENSE. *
15 #* *
16 #**************************************************************************
17
18 # check-typo - Check typographic conventions on OCaml sources.
19
20 # This program will check files for the following rules:
21
22 # - absence of TAB characters (tab)
23 # - absence of non-ASCII characters (non-ascii)
24 # - absence of non-printing ASCII characters (non-printing)
25 # - absence of white space at end of line (white-at-eol)
26 # - absence of empty lines at end of file (white-at-eof)
27 # - presence of a LF character at the end of the file (missing-lf)
28 # - maximum line length of 80 characters (long-line)
29 # - maximum line length of 132 characters (very-long-line)
30 # - presence of a copyright header (missing-header)
31 # - absence of a leftover "$Id" string (svn-keyword)
32
33 # Exceptions are handled with git attributes: "typo.*".
34 # Its value for a given file is a comma-separated list of rule names,
35 # which lists the rules that should be disabled for this file.
36 # The rule names are the ones shown above in parentheses.
37
38 # Built-in exception:
39 # - Any file git identifies as binary
40 # is automatically exempt from all the rules.
41
42 # ASCII characters are bytes from 0 to 127. Any other byte is
43 # flagged as a non-ASCII character.
44
45 # For the purpose of this tool, printing ASCII characters are:
46 # - the non-white printable ASCII characters (33 to 126)
47 # - TAB (09)
48 # - LF (10)
49 # - SPC (32)
50 # Anything else is flagged as a non-printing ASCII character.
51
52 # This program will recursively explore the files and directories given
53 # on the command line (or by default the current directory), and check
54 # every file therein for compliance to the rules.
55
56 # Directories named .git (and their contents) are always ignored.
57 # This program ignores any file that is not under git control, unless
58 # explicitly given on the command line.
59
60 # If a directory has the git attribute "typo.prune" then it and its contents are
61 # ignored.
62
63 # You can ignore a rule by giving the option -<rule> on the command
64 # line (before any file names).
65
66 # Files which include the utf8 rule will be validated using grep and line-length
67 # computations will take UTF-8 sequences into account. As a special case, UTF-8
68 # sequences are always allowed in the copyright headers.
69
# Force the C locale so that the byte-oriented tools used below (grep, sed,
# tr, awk) are not influenced by the i18n settings of the caller.
LC_ALL=C
export LC_ALL

# Overridable helpers: a non-empty value supplied by the caller is kept,
# otherwise the default on the right-hand side is installed.
: "${OCAML_CT_CAT:=cat}"
: "${OCAML_CT_LS_FILES:=git ls-files}"
: "${OCAML_CT_HEAD:=HEAD}"
: "${OCAML_CT_AWK:=awk}"

# When OCAML_CT_GIT_INDEX is set (even to the empty string) it is turned into
# a "GIT_INDEX_FILE=<value>" assignment that is later handed to env; when it
# is unset it becomes the empty string.
OCAML_CT_GIT_INDEX=${OCAML_CT_GIT_INDEX+GIT_INDEX_FILE=$OCAML_CT_GIT_INDEX}
82
# get_attrs PATH
#
# Print, one per line, the "typo.*" git attributes in effect for PATH:
# - the "typo." prefix is dropped
# - attributes whose value is unset/false are omitted
# - attributes whose value is set/true are printed bare
# - attributes whose value is "may" are printed with a trailing "?"
#
# For example,
#   typo.long-line: set
#   typo.missing-header: may
#   typo.very-long-line: false
# yields "long-line" and "missing-header?".
#
# $OCAML_CT_GIT_INDEX and $OCAML_CT_CA_FLAG are deliberately left unquoted so
# that an empty value contributes no argument at all.
get_attrs() {
  env $OCAML_CT_GIT_INDEX git check-attr --all $OCAML_CT_CA_FLAG "$1" \
    | grep -o ' typo\..*$' \
    | sed -e 's/ typo\.//g' \
    | grep -v -e ': unset' -e ': false' \
    | sed -e 's/: set//g' -e 's/: true//g' -e 's/: may/?/g'
}
100
# check_prune PATH
#
# Query the "typo.prune" attribute of PATH. The output is empty when the
# path is *not* pruned (attribute unspecified or false); otherwise the
# "git check-attr" line is printed, and callers test for non-empty output.
check_prune() {
  env $OCAML_CT_GIT_INDEX git check-attr typo.prune $OCAML_CT_CA_FLAG "$1" \
    | grep -v -e ': unspecified$' -e ': false$'
}
106
# Internal sub-commands, handled before regular option parsing.
#
#   --check-prune PATH
#       Used by the recursive call from the find command (see IGNORE_DIRS).
#       Exits 0 when PATH has to be pruned (it is .git, lies below .git, or
#       carries the typo.prune attribute) and 3 otherwise; the non-zero
#       status makes the -exec test of find fail, so nothing is pruned.
#   --get-attrs PATH
#       Prints the processed typo.* attributes of PATH and exits 0.
case "$1" in
  --check-prune)
    case "$2" in
      .git|.git/*)
        echo "INFO: pruned path $2 (.git)" >&2
        exit 0;;
    esac
    if [ -n "$(check_prune "$2")" ]; then
      echo "INFO: pruned path $2 (typo.prune)" >&2
      exit 0
    fi
    exit 3;;
  --get-attrs)
    get_attrs "$2"
    exit 0;;
esac
127
# Print the command-line synopsis on stderr and terminate with status 2.
usage () {
  printf '%s\n' "usage: check-typo {-<rule>} [--] {<file-or-dir>}" >&2
  exit 2
}
132
# check_script FILE
#
# Called for files that are marked executable. Reads FILE (through
# $OCAML_CT_CAT, with $OCAML_CT_PREFIX prepended to the name) and, unless its
# first line starts with "#!", prints advice on stdout and sets the global
# EXIT_CODE to 1.
#
# No variable other than EXIT_CODE is assigned: the caller's loop variable
# "f" holds the file currently being checked and must not be altered here.
check_script () {
  if [ "$($OCAML_CT_CAT "$OCAML_CT_PREFIX$1" \
          | sed -ne '1s/^#!.*/#!/p')" != '#!' ] ; then
    # These files are listed manually, rather than via gitattributes,
    # because the list should never expand, and it should not be trivial to
    # expand (the unix-execvpe test is an ultra-special-case!)
    # A leading "./" (as produced by "find .") is ignored for the comparison.
    case "${1#./}" in
      boot/ocamlc|boot/ocamllex)
        ;;
      testsuite/tests/lib-unix/unix-execvpe/subdir/script2)
        ;;
      *)
        echo "$1 shouldn't be executable; either:"
        echo " - Add a #! line"
        echo " - Run chmod -x $1 (on Unix)"
        echo " - Run git update-index --chmod=-x $1 (on Windows)"
        echo "You may wish to check your core.fileMode setting"
        EXIT_CODE=1;;
    esac
  fi
}
151
# Rules disabled from the command line, space-separated (each -<rule> option
# is prepended, so the most recent one comes first).
userrules=''

# Consume the leading options:
#   -help | --help   print the synopsis and exit
#   --               end of options; everything after it is a file name
#   -<rule>          disable <rule> for this run
# The "--" arm has to come before the generic "-*" arm: "--" also matches
# "-*", and would otherwise be recorded as a rule named "-" instead of
# terminating option parsing.
while : ; do
  case "$1" in
    -help|--help) usage;;
    --) shift; break;;
    -*) userrules="${1#-} $userrules"; shift;;
    *) break;;
  esac
done
162
# Predicates spliced into every "find" invocation. The variable is expanded
# unquoted on purpose, so that it is split into separate find arguments:
#   - anything named .git is pruned;
#   - every directory is handed to "$0 --check-prune" and pruned when that
#     command exits with status 0.
#
# `-type d`: simple files (not directories) are not pruned during the
# "find" invocation but later on (look for "check_prune") for performance
# reasons: most files outside pruned directories are not pruned, so it
# is faster to optimistically run check-typo on them (and maybe get
# out in the middle) than to first check then run.
IGNORE_DIRS="-name .git -prune -o"
IGNORE_DIRS="$IGNORE_DIRS -type d -exec $0 --check-prune {} ; -prune -o"
172
# Make sure $OCAML_CT_AWK understands interval expressions ({m} notation),
# which the header-recognition patterns of the awk program rely on.
#
# The probe reports its result through the exit status: "aa" matches
# /^a{2}$/ only when {2} is interpreted as a repetition count, in which case
# awk exits 0; an awk that takes the braces literally exits 1.
# (A probe that merely calls "exit 0" on a match cannot tell the two cases
# apart, because awk exits 0 anyway when BEGIN finishes normally.)
TEST_AWK='BEGIN { exit !("aa" ~ /^a{2}$/) }'
if ! $OCAML_CT_AWK "$TEST_AWK" ; then
  # No native support: retry with --re-interval, the gawk option that
  # enables interval expressions, and keep the flag if it helps.
  if $OCAML_CT_AWK --re-interval "$TEST_AWK" 2>/dev/null ; then
    OCAML_CT_AWK="$OCAML_CT_AWK --re-interval"
  else
    echo "This script requires interval support in regexes ({m} notation)">&2
    echo "Please install a version of awk (e.g. gawk) which supports this">&2
    exit 2
  fi
fi
184
# Main driver: a producer subshell lists the candidate files with find, and a
# consumer subshell reads that list and checks every file.
# The consumer is the last stage of the pipeline, so its exit status is the
# status of the pipeline:
#   0 - nothing was reported
#   1 - at least one problem was reported (by the awk program or by
#       check_script)
#   2 - a file carrying the utf8 rule is not valid UTF-8 (checking stops)
EXIT_CODE=0
( case $# in
    0) find . $IGNORE_DIRS -type f -print;;
    *) for i in "$@"; do find "$i" $IGNORE_DIRS -type f -print; done;;
  esac
) | (
  # Hash of the empty tree; computed on first use and reused afterwards.
  EMPTY=''
  # IFS= and -r make read return each path exactly as find printed it
  # (no trimming of blanks, no backslash processing).
  while IFS= read -r f; do
    # Directories were pruned by find; plain files are tested here.
    if test -n "$(check_prune "$f")"; then continue; fi
    if git check-ignore -q "$f"; then continue; fi
    case "$($OCAML_CT_LS_FILES "$f" 2>&1)" in
      "") path_in_index=false;;
      *) path_in_index=true;;
    esac
    # A file counts as given on the command line when its path occurs
    # literally in the argument list. $f is quoted so that glob characters
    # in a file name are not interpreted as a pattern.
    case "$*" in
      *"$f"*) is_cmd_line=true;;
      *) is_cmd_line=false;;
    esac
    # Executable files must start with a #! line. Without OCAML_CT_PREFIX
    # the permission of the file on disk is tested; with it, the mode
    # recorded by git (100755) is used instead.
    # NOTE(review): this runs before the "not under git control" filter
    # just below, so untracked executables are examined too - confirm that
    # this is intended.
    if [ -z "$OCAML_CT_PREFIX" ] ; then
      if [ -x "$f" ] ; then
        check_script "$f"
      fi
    else
      if git ls-files -s "$f" | grep -q "^100755" ; then
        check_script "$f"
      fi
    fi
    if $path_in_index || $is_cmd_line; then :; else continue; fi
    attr_rules=''
    if $path_in_index; then
      # Below is a git plumbing command to detect whether git regards a
      # particular file as binary. This takes into account .gitattributes,
      # but also works if the file has been automatically detected as binary
      # by git. EMPTY is the hash of the empty tree (which is specially known
      # to git - it is automatically included in every repository) as a way
      # to get `diff-tree` to print the whole tree state; its `--numstat`
      # output then prints a summary where two dashes in the first two
      # columns indicates a binary file.
      # (See https://git-scm.com/docs/git-diff-tree#_other_diff_formats and
      # the documentation for the --numstat option. Commands designated as
      # "plumbing" commands in git have stable output intended for parsing)
      if [ -z "$EMPTY" ]; then
        EMPTY=$(git hash-object -t tree /dev/null)
      fi
      git diff-tree --numstat $EMPTY $OCAML_CT_HEAD -- "$f" \
        | grep -q "^-[[:blank:]]-" && continue
      attr_rules=$(get_attrs "$f")
    fi
    rules="$userrules"

    # remove newlines, ensure spaces at boundary
    rules=" $(echo $rules) "
    attr_rules=" $(echo $attr_rules) "

    if test -n "$(echo "$rules $attr_rules" | grep " utf8 ")"
    then
      # grep -a is used to force the file to be considered as text and -x
      # requires the entire line to match. This specifically detects the
      # presence of lines containing malformed UTF-8. It may be tested using
      # https://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
      if $OCAML_CT_CAT "$OCAML_CT_PREFIX$f" \
         | LC_ALL=en_US.UTF8 grep -qaxv '.*' ; then
        echo "File \"$f\" is not correctly encoded in UTF-8"
        exit 2
      fi
    fi
    # The file is fed to awk with CR characters removed and one extra
    # newline appended (the trailing echo); the END rule relies on that
    # extra line to detect a missing final linefeed.
    if ! \
      ($OCAML_CT_CAT "$OCAML_CT_PREFIX$f" | tr -d '\r'; echo) \
      | $OCAML_CT_AWK -v rules="$rules" -v attr_rules="$attr_rules" \
          -v file="$f" \
      '
        # True when rule NAME is active, i.e. listed neither in rules nor
        # in attr_rules (with or without a trailing question mark).
        function is_err(name) {
          return ((rules attr_rules) !~ (" " name "[\\? ]"));
        }

        # Print one diagnostic as file:line.column: [name] msg, where the
        # column is RSTART + RLENGTH, and remember that something failed.
        function report_err(name, msg) {
          printf ("%s:%d.%d:", file, NR, RSTART + RLENGTH);
          printf (" [%s] %s\n", name, msg);
          got_errors = 1;
        }

        # Count an occurrence of NAME; report it when the rule is active,
        # for the first 10 occurrences only.
        function err(name, msg) {
          ++ counts[name];
          if (is_err(name) && counts[name] <= 10) {
            report_err(name, msg);
            if (counts[name] == 10){
              printf ("WARNING: too many [%s] in this file.", name);
              printf (" Others will not be reported.\n");
            }
          }
        }

        # Like err, but NAME is only counted (never reported) when the
        # rule GUARD is disabled.
        function err_if(guard, name, msg) {
          if (is_err(guard)) {
            err(name, msg);
          } else {
            ++ counts[name];
          }
        }

        # True when STR occupies more than LIMIT columns, a TAB advancing
        # to the next multiple of 8. c and i are local variables.
        function more_columns(str, limit,    c, i){
          c = 0;
          for (i = 1; i <= length(str); i++){
            if (substr(str, i, 1) == "\t"){
              c = int((c + 8) / 8) * 8;
            }else{
              ++ c;
            }
          }
          return c > limit;
        }

        # When the utf8 rule is listed for this file, return STR with every
        # multi-byte sequence replaced by a single "?", so that length()
        # counts characters; otherwise return STR unchanged.
        function utf8_decode(str) {
          if (is_err("utf8")) {
            return str;
          } else {
            # This script assumes that the UTF-8 has been externally
            # validated
            t = str;
            gsub(/[\300-\367][\200-\277]+/, "?", t);
            if (t != str) {
              ++ counts["utf8"];
            }
            return t;
          }
        }

        BEGIN { state = "(first line)"; }

        # Lines containing a TAB: flag the TAB, and measure the line with
        # TAB expansion.
        match($0, /\t/) {
          err("tab", "TAB character(s)");
          t = utf8_decode($0);
          if (more_columns(t, 80)){
            RSTART=81;
            RLENGTH = 0;
            err_if("very-long-line", "long-line", "line is over 80 columns");
          }
          if (more_columns(t, 132)){
            RSTART=133;
            RLENGTH = 0;
            err("very-long-line", "line is over 132 columns");
          }
        }

        # Bytes above 127 outside the authors/copyright part of the header.
        match($0, /[\200-\377]/) \
        && state != "authors" && state != "copyright" {
          if (is_err("utf8")) {
            err("non-ascii", "non-ASCII character(s)");
            if (header_utf8 && !is_err("non-ascii")) {
              err("non-ascii-utf8", \
                  "non-ASCII character(s) AND UTF-8 encountered");
            }
          } else {
            ++ counts["utf8"];
          }
        }

        match($0, /[^\t\200-\377 -~]/) {
          err("non-printing", "non-printing ASCII character(s)");
        }

        match($0, /[ \t]+$/) {
          err("white-at-eol", "whitespace at end of line");
        }

        match($0, /\$Id(: .*)?\$/) {
          err("svn-keyword", "SVN keyword marker");
        }

        # Lines without TAB: a trailing URL of 73 characters or more is
        # removed before the 80-column test.
        $0 !~ /\t/ && length($0) > 80 {
          t = utf8_decode($0);
          sub(/https?:[A-Za-z0-9._~:\/?#\[\]@!$&\047()*+,;=%-]{73,}$/, "", t);
          if (length(t) > 80) {
            RSTART = 81;
            RLENGTH = 0;
            err_if("very-long-line", "long-line", "line is over 80 columns");
          }
        }

        $0 !~ /\t/ && length($0) > 132 {
          RSTART = 133;
          RLENGTH = 0;
          t = utf8_decode($0);
          if (length(t) > 132) {
            err("very-long-line", "line is over 132 columns");
          }
        }

        # Record that the header contained UTF-8 sequences
        match($0, /[\300-\367][\200-\277]+/) \
        && (state == "authors" || state == "copyright") {
          header_utf8 = 1;
          if (counts["non-ascii"] > 0 && is_err("non-ascii")) {
            err("non-ascii-utf8", \
                "non-ASCII character(s) AND UTF-8 encountered");
          }
        }

        # Header-recognition automaton. Read this from bottom to top: the
        # rules are listed in reverse order of the header lines, so that a
        # state entered on one line is only examined on the next line.
        # Valid UTF-8 chars are recognised in copyright and authors
        # TODO: ensure all files are valid UTF-8 before awking them.
        # Note that this code also assumes that combining characters are
        # NOT used (i.e. that every Unicode code-point corresponds to
        # exactly one displayed character, so no Camels and no
        # weird-and-wonderful ways of encoding accented letters).

        state == "close" && $0 ~ /\*{74}/ { state = "OK"; }
        state == "close" { state = "(last line)"; }
        state == "blurb" && $0 ~ /\* {72}\*/ { state = "close"; }
        state == "blurb" && $0 ~ /\/LICENSE/ { state = "(license path)" }
        state == "blurb1" && $0 ~ /\* All rights reserved. .{47} \*/ \
            { state = "blurb"; }
        state == "blurb1" { state = "(blurb line 1)"; }
        state == "copyright" && $0 ~ /\* {72}\*/ { state = "blurb1"; }
        state == "copyright" \
          && $0 !~ /\* Copyright [0-9]{4}([\300-\367][\200-\277]+|.){54} \*/ \
          && $0 !~ /\* ([\300-\367][\200-\277]+|.){66} \*/ \
            { state = "(copyright lines)"; }
        state == "authors" && $0 ~ /\* {72}\*/ { state = "copyright"; }
        state == "authors" \
          && $0 !~ /\* ([\300-\367][\200-\277]+|.){70} \*/ \
            { state = "(authors)"; }
        state == "blank2" && $0 ~ /\* {72}\*/ { state = "authors"; }
        state == "blank2" { state = "(blank line 2)"; }
        state == "title" && $0 ~ /\* {33}OCaml {34}\*/ { state = "blank2"; }
        state == "title" { state = "(title line)"; }
        state == "blank1" && $0 ~ /\* {72}\*/ { state = "title"; }
        state == "blank1" { state = "(blank line 1)"; }
        state == "(first line)" && NR < 4 && $0 ~ /\*{74}/ { state = "blank1"; }

        # Remember the last two lines for the end-of-file checks.
        {
          prev_line = last_line;
          last_line = $0;
        }

        END {
          # The input carries one extra newline, so a non-empty last line
          # means that the file itself did not end with a linefeed.
          if (match(last_line, /.+/)){
            err("missing-lf", "missing linefeed at EOF");
            prev_line = last_line;
            ++ NR;
            empty_file = 0;
          }else{
            empty_file = NR == 1;
          }
          if (!empty_file && match(prev_line, /^$/)){
            err("white-at-eof", "empty line(s) at EOF");
          }
          if (state != "OK"){
            if (NR >= 10){
              NR = 1;
              RSTART = 1;
              RLENGTH = 0;
              err("missing-header", sprintf("bad copyright header %s", state));
            }else{
              # Short files are not reported, but the rule is marked as
              # used so that the attribute is not flagged below.
              counts["missing-header"] = 1;
            }
          }
          # Report attributes that disabled a rule which never triggered,
          # except those marked with a question mark ("may").
          split(attr_rules, r, "[? ]");
          for (i in r){
            name = r[i];
            if (name != "" && !counts[name]){
              NR = 1;
              RSTART = 1;
              RLENGTH = 0;
              if (attr_rules !~ (" " name "\\? ")) {
                report_err(name, "attribute is unused");
              }
            }
          }
          exit got_errors;
        }
      ' ; then
      EXIT_CODE=1
    fi
  done
  exit $EXIT_CODE
)
458