@@ -4,96 +4,75 @@ Module: char
44Utilities for manipulating the char type
55*/
66
7+ /*
8+ Lu Uppercase_Letter an uppercase letter
9+ Ll Lowercase_Letter a lowercase letter
10+ Lt Titlecase_Letter a digraphic character, with first part uppercase
11+ Lm Modifier_Letter a modifier letter
12+ Lo Other_Letter other letters, including syllables and ideographs
13+ Mn Nonspacing_Mark a nonspacing combining mark (zero advance width)
14+ Mc Spacing_Mark a spacing combining mark (positive advance width)
15+ Me Enclosing_Mark an enclosing combining mark
16+ Nd Decimal_Number a decimal digit
17+ Nl Letter_Number a letterlike numeric character
18+ No Other_Number a numeric character of other type
19+ Pc Connector_Punctuation a connecting punctuation mark, like a tie
20+ Pd Dash_Punctuation a dash or hyphen punctuation mark
21+ Ps Open_Punctuation an opening punctuation mark (of a pair)
22+ Pe Close_Punctuation a closing punctuation mark (of a pair)
23+ Pi Initial_Punctuation an initial quotation mark
24+ Pf Final_Punctuation a final quotation mark
25+ Po Other_Punctuation a punctuation mark of other type
26+ Sm Math_Symbol a symbol of primarily mathematical use
27+ Sc Currency_Symbol a currency sign
28+ Sk Modifier_Symbol a non-letterlike modifier symbol
29+ So Other_Symbol a symbol of other type
30+ Zs Space_Separator a space character (of various non-zero widths)
31+ Zl Line_Separator U+2028 LINE SEPARATOR only
32+ Zp Paragraph_Separator U+2029 PARAGRAPH SEPARATOR only
33+ Cc Control a C0 or C1 control code
34+ Cf Format a format control character
35+ Cs Surrogate a surrogate code point
36+ Co Private_Use a private-use character
37+ Cn Unassigned a reserved unassigned code point or a noncharacter
38+ */
39+
40+ import is_alphabetic = unicode:: derived_property:: Alphabetic ;
41+ import is_XID_start = unicode:: derived_property:: XID_Start ;
42+ import is_XID_continue = unicode:: derived_property:: XID_Continue ;
43+
744/*
845Function: is_whitespace
946
10- Indicates whether a character is whitespace.
47+ Indicates whether a character is whitespace, defined in terms of
48+ the Unicode General Categories 'Zs', 'Zl', 'Zp' and the additional
49+ 'Cc'-category control codes in the range [0x09, 0x0d].
1150
12- Whitespace characters include space (U+0020), tab (U+0009), line feed
13- (U+000A), carriage return (U+000D), and a number of less common
14- ASCII and unicode characters.
1551*/
1652pure fn is_whitespace ( c : char ) -> bool {
17- const ch_space: char = '\u0020' ;
18- const ch_ogham_space_mark: char = '\u1680' ;
19- const ch_mongolian_vowel_sep: char = '\u180e' ;
20- const ch_en_quad: char = '\u2000' ;
21- const ch_em_quad: char = '\u2001' ;
22- const ch_en_space: char = '\u2002' ;
23- const ch_em_space: char = '\u2003' ;
24- const ch_three_per_em_space: char = '\u2004' ;
25- const ch_four_per_em_space: char = '\u2005' ;
26- const ch_six_per_em_space: char = '\u2006' ;
27- const ch_figure_space: char = '\u2007' ;
28- const ch_punctuation_space: char = '\u2008' ;
29- const ch_thin_space: char = '\u2009' ;
30- const ch_hair_space: char = '\u200a' ;
31- const ch_narrow_no_break_space: char = '\u202f' ;
32- const ch_medium_mathematical_space: char = '\u205f' ;
33- const ch_ideographic_space: char = '\u3000' ;
34- const ch_line_separator: char = '\u2028' ;
35- const ch_paragraph_separator: char = '\u2029' ;
36- const ch_character_tabulation: char = '\u0009' ;
37- const ch_line_feed: char = '\u000a' ;
38- const ch_line_tabulation: char = '\u000b' ;
39- const ch_form_feed: char = '\u000c' ;
40- const ch_carriage_return: char = '\u000d' ;
41- const ch_next_line: char = '\u0085' ;
42- const ch_no_break_space: char = '\u00a0' ;
43-
44- if c == ch_space {
45- true
46- } else if c == ch_ogham_space_mark {
47- true
48- } else if c == ch_mongolian_vowel_sep {
49- true
50- } else if c == ch_en_quad {
51- true
52- } else if c == ch_em_quad {
53- true
54- } else if c == ch_en_space {
55- true
56- } else if c == ch_em_space {
57- true
58- } else if c == ch_three_per_em_space {
59- true
60- } else if c == ch_four_per_em_space {
61- true
62- } else if c == ch_six_per_em_space {
63- true
64- } else if c == ch_figure_space {
65- true
66- } else if c == ch_punctuation_space {
67- true
68- } else if c == ch_thin_space {
69- true
70- } else if c == ch_hair_space {
71- true
72- } else if c == ch_narrow_no_break_space {
73- true
74- } else if c == ch_medium_mathematical_space {
75- true
76- } else if c == ch_ideographic_space {
77- true
78- } else if c == ch_line_tabulation {
79- true
80- } else if c == ch_paragraph_separator {
81- true
82- } else if c == ch_character_tabulation {
83- true
84- } else if c == ch_line_feed {
85- true
86- } else if c == ch_line_tabulation {
87- true
88- } else if c == ch_form_feed {
89- true
90- } else if c == ch_carriage_return {
91- true
92- } else if c == ch_next_line {
93- true
94- } else if c == ch_no_break_space { true } else { false }
53+ ret ( '\x09' <= c && c <= ' \x0x0d' )
54+ || unicode:: general_category:: Zs ( c)
55+ || unicode:: general_category:: Zl ( c)
56+ || unicode:: general_category:: Zp ( c) ;
57+ }
58+
59+ /*
60+ Function: is_alphanumeric
61+
62+ Indicates whether a character is alphanumeric, defined in terms of
63+ the Unicode General Categories 'Nd', 'Nl', 'No' and the Derived
64+ Core Property 'Alphabetic'.
65+
66+ */
67+
68+ pure fn is_alphanumeric ( c : char ) -> bool {
69+ ret unicode:: derived_property:: Alphabetic ( c) ||
70+ unicode:: general_category:: Nd ( c) ||
71+ unicode:: general_category:: Nl ( c) ||
72+ unicode:: general_category:: No ( c) ;
9573}
9674
75+
9776/*
9877 Function: to_digit
9978
0 commit comments