Skip to content

Commit 0c078fc

Browse files
committed
patch 8.0.0519: character classes are not well tested
Problem: Character classes are not well tested. They can differ between platforms. Solution: Add tests. In the documentation make clear which classes depend on what library function. Only use :cntrl: and :graph: for ASCII. (Kazunobu Kuriyama, Dominique Pelle, closes #1560) Update the documentation.
1 parent c6cd840 commit 0c078fc

5 files changed

Lines changed: 90 additions & 32 deletions

File tree

runtime/doc/pattern.txt

Lines changed: 26 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1085,25 +1085,27 @@ x A single character, with no special meaning, matches itself
10851085
- A character class expression is evaluated to the set of characters
10861086
belonging to that character class. The following character classes
10871087
are supported:
1088-
Name Contents ~
1089-
*[:alnum:]* [:alnum:] ASCII letters and digits
1090-
*[:alpha:]* [:alpha:] ASCII letters
1091-
*[:blank:]* [:blank:] space and tab characters
1092-
*[:cntrl:]* [:cntrl:] control characters
1093-
*[:digit:]* [:digit:] decimal digits
1094-
*[:graph:]* [:graph:] printable characters excluding space
1095-
*[:lower:]* [:lower:] lowercase letters (all letters when
1088+
Name Func Contents ~
1089+
*[:alnum:]* [:alnum:] isalnum ASCII letters and digits
1090+
*[:alpha:]* [:alpha:] isalpha ASCII letters
1091+
*[:blank:]* [:blank:] space and tab
1092+
*[:cntrl:]* [:cntrl:] iscntrl ASCII control characters
1093+
*[:digit:]* [:digit:] decimal digits '0' to '9'
1094+
*[:graph:]* [:graph:] isgraph ASCII printable characters excluding
1095+
space
1096+
*[:lower:]* [:lower:] (1) lowercase letters (all letters when
10961097
'ignorecase' is used)
1097-
*[:print:]* [:print:] printable characters including space
1098-
*[:punct:]* [:punct:] ASCII punctuation characters
1099-
*[:space:]* [:space:] whitespace characters
1100-
*[:upper:]* [:upper:] uppercase letters (all letters when
1098+
*[:print:]* [:print:] (2) printable characters including space
1099+
*[:punct:]* [:punct:] ispunct ASCII punctuation characters
1100+
*[:space:]* [:space:] whitespace characters: space, tab, CR,
1101+
NL, vertical tab, form feed
1102+
*[:upper:]* [:upper:] (3) uppercase letters (all letters when
11011103
'ignorecase' is used)
1102-
*[:xdigit:]* [:xdigit:] hexadecimal digits
1103-
*[:return:]* [:return:] the <CR> character
1104-
*[:tab:]* [:tab:] the <Tab> character
1105-
*[:escape:]* [:escape:] the <Esc> character
1106-
*[:backspace:]* [:backspace:] the <BS> character
1104+
*[:xdigit:]* [:xdigit:] hexadecimal digits: 0-9, a-f, A-F
1105+
*[:return:]* [:return:] the <CR> character
1106+
*[:tab:]* [:tab:] the <Tab> character
1107+
*[:escape:]* [:escape:] the <Esc> character
1108+
*[:backspace:]* [:backspace:] the <BS> character
11071109
The brackets in character class expressions are additional to the
11081110
brackets delimiting a collection. For example, the following is a
11091111
plausible pattern for a UNIX filename: "[-./[:alnum:]_~]\+" That is,
@@ -1114,6 +1116,13 @@ x A single character, with no special meaning, matches itself
11141116
regexp engine. See |two-engines|. In the future these items may
11151117
work for multi-byte characters. For now, to get all "alpha"
11161118
characters you can use: [[:lower:][:upper:]].
1119+
1120+
The "Func" column shows what library function is used. The
1121+
implementation depends on the system. Otherwise:
1122+
(1) Uses islower() for ASCII and Vim builtin rules for other
1123+
characters when built with the |+multi_byte| feature.
1124+
(2) Uses Vim builtin rules
1125+
(3) As with (1) but using isupper()
11171126
*/[[=* *[==]*
11181127
- An equivalence class. This means that characters are matched that
11191128
have almost the same meaning, e.g., when ignoring accents. This

src/regexp.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2555,17 +2555,17 @@ regatom(int *flagp)
25552555
regc('\t');
25562556
break;
25572557
case CLASS_CNTRL:
2558-
for (cu = 1; cu <= 255; cu++)
2558+
for (cu = 1; cu <= 127; cu++)
25592559
if (iscntrl(cu))
25602560
regmbc(cu);
25612561
break;
25622562
case CLASS_DIGIT:
2563-
for (cu = 1; cu <= 255; cu++)
2563+
for (cu = 1; cu <= 127; cu++)
25642564
if (VIM_ISDIGIT(cu))
25652565
regmbc(cu);
25662566
break;
25672567
case CLASS_GRAPH:
2568-
for (cu = 1; cu <= 255; cu++)
2568+
for (cu = 1; cu <= 127; cu++)
25692569
if (isgraph(cu))
25702570
regmbc(cu);
25712571
break;

src/regexp_nfa.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4871,15 +4871,15 @@ check_char_class(int class, int c)
48714871
return OK;
48724872
break;
48734873
case NFA_CLASS_CNTRL:
4874-
if (c >= 1 && c <= 255 && iscntrl(c))
4874+
if (c >= 1 && c <= 127 && iscntrl(c))
48754875
return OK;
48764876
break;
48774877
case NFA_CLASS_DIGIT:
48784878
if (VIM_ISDIGIT(c))
48794879
return OK;
48804880
break;
48814881
case NFA_CLASS_GRAPH:
4882-
if (c >= 1 && c <= 255 && isgraph(c))
4882+
if (c >= 1 && c <= 127 && isgraph(c))
48834883
return OK;
48844884
break;
48854885
case NFA_CLASS_LOWER:

src/testdir/test_regexp_utf8.vim

Lines changed: 57 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -38,46 +38,93 @@ func s:classes_test()
3838
set isprint=@,161-255
3939
call assert_equal('Motörhead', matchstr('Motörhead', '[[:print:]]\+'))
4040

41+
let alnumchars = ''
4142
let alphachars = ''
43+
let backspacechar = ''
44+
let blankchars = ''
45+
let cntrlchars = ''
46+
let digitchars = ''
47+
let escapechar = ''
48+
let graphchars = ''
4249
let lowerchars = ''
43-
let upperchars = ''
44-
let alnumchars = ''
4550
let printchars = ''
4651
let punctchars = ''
52+
let returnchar = ''
53+
let spacechars = ''
54+
let tabchar = ''
55+
let upperchars = ''
4756
let xdigitchars = ''
4857
let i = 1
4958
while i <= 255
5059
let c = nr2char(i)
5160
if c =~ '[[:alpha:]]'
5261
let alphachars .= c
5362
endif
54-
if c =~ '[[:lower:]]'
55-
let lowerchars .= c
56-
endif
57-
if c =~ '[[:upper:]]'
58-
let upperchars .= c
59-
endif
6063
if c =~ '[[:alnum:]]'
6164
let alnumchars .= c
6265
endif
66+
if c =~ '[[:backspace:]]'
67+
let backspacechar .= c
68+
endif
69+
if c =~ '[[:blank:]]'
70+
let blankchars .= c
71+
endif
72+
if c =~ '[[:cntrl:]]'
73+
let cntrlchars .= c
74+
endif
75+
if c =~ '[[:digit:]]'
76+
let digitchars .= c
77+
endif
78+
if c =~ '[[:escape:]]'
79+
let escapechar .= c
80+
endif
81+
if c =~ '[[:graph:]]'
82+
let graphchars .= c
83+
endif
84+
if c =~ '[[:lower:]]'
85+
let lowerchars .= c
86+
endif
6387
if c =~ '[[:print:]]'
6488
let printchars .= c
6589
endif
6690
if c =~ '[[:punct:]]'
6791
let punctchars .= c
6892
endif
93+
if c =~ '[[:return:]]'
94+
let returnchar .= c
95+
endif
96+
if c =~ '[[:space:]]'
97+
let spacechars .= c
98+
endif
99+
if c =~ '[[:tab:]]'
100+
let tabchar .= c
101+
endif
102+
if c =~ '[[:upper:]]'
103+
let upperchars .= c
104+
endif
69105
if c =~ '[[:xdigit:]]'
70106
let xdigitchars .= c
71107
endif
72108
let i += 1
73109
endwhile
74110

75111
call assert_equal('ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz', alphachars)
76-
call assert_equal('abcdefghijklmnopqrstuvwxyzµßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ', lowerchars)
77-
call assert_equal('ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞ', upperchars)
78112
call assert_equal('0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz', alnumchars)
113+
call assert_equal("\b", backspacechar)
114+
call assert_equal("\t ", blankchars)
115+
" Commented out: it succeeds on Linux and Windows, but fails on macOs in Travis.
116+
" call assert_equal("\x01\x02\x03\x04\x05\x06\x07\b\t\n\x0b\f\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\e\x1c\x1d\x1e\x1f\x7f", cntrlchars)
117+
call assert_equal("0123456789", digitchars)
118+
call assert_equal("\<Esc>", escapechar)
119+
" Commented out: it succeeds on Linux and Windows, but fails on macOs in Travis.
120+
" call assert_equal('!"#$%&''()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~', graphchars)
121+
call assert_equal('abcdefghijklmnopqrstuvwxyzµßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ', lowerchars)
79122
call assert_equal(' !"#$%&''()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~ ¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ', printchars)
80123
call assert_equal('!"#$%&''()*+,-./:;<=>?@[\]^_`{|}~', punctchars)
124+
call assert_equal('ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞ', upperchars)
125+
call assert_equal("\r", returnchar)
126+
call assert_equal("\t\n\x0b\f\r ", spacechars)
127+
call assert_equal("\t", tabchar)
81128
call assert_equal('0123456789ABCDEFabcdef', xdigitchars)
82129
endfunc
83130

src/version.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -764,6 +764,8 @@ static char *(features[]) =
764764

765765
static int included_patches[] =
766766
{ /* Add new patch number below this line */
767+
/**/
768+
519,
767769
/**/
768770
518,
769771
/**/

0 commit comments

Comments
 (0)