1010
1111//! Utilities for manipulating the char type
1212
13- #[ cfg( not( test) ) ]
14- use cmp:: Ord ;
1513use option:: { None , Option , Some } ;
1614use str;
15+ #[ cfg( stage0) ]
16+ use str:: StrSlice ;
17+ #[ cfg( not( stage0) ) ]
18+ use str:: { StrSlice , OwnedStr } ;
1719use u32;
1820use uint;
1921use unicode:: { derived_property, general_category} ;
2022
21- #[ cfg( not( test) ) ] use cmp:: Eq ;
23+ #[ cfg( not( test) ) ]
24+ use cmp:: { Eq , Ord } ;
2225
2326/*
24- Lu Uppercase_Letter an uppercase letter
25- Ll Lowercase_Letter a lowercase letter
26- Lt Titlecase_Letter a digraphic character, with first part uppercase
27- Lm Modifier_Letter a modifier letter
28- Lo Other_Letter other letters, including syllables and ideographs
29- Mn Nonspacing_Mark a nonspacing combining mark (zero advance width)
30- Mc Spacing_Mark a spacing combining mark (positive advance width)
31- Me Enclosing_Mark an enclosing combining mark
32- Nd Decimal_Number a decimal digit
33- Nl Letter_Number a letterlike numeric character
34- No Other_Number a numeric character of other type
27+ Lu Uppercase_Letter an uppercase letter
28+ Ll Lowercase_Letter a lowercase letter
29+ Lt Titlecase_Letter a digraphic character, with first part uppercase
30+ Lm Modifier_Letter a modifier letter
31+ Lo Other_Letter other letters, including syllables and ideographs
32+ Mn Nonspacing_Mark a nonspacing combining mark (zero advance width)
33+ Mc Spacing_Mark a spacing combining mark (positive advance width)
34+ Me Enclosing_Mark an enclosing combining mark
35+ Nd Decimal_Number a decimal digit
36+ Nl Letter_Number a letterlike numeric character
37+ No Other_Number a numeric character of other type
3538 Pc Connector_Punctuation a connecting punctuation mark, like a tie
36- Pd Dash_Punctuation a dash or hyphen punctuation mark
37- Ps Open_Punctuation an opening punctuation mark (of a pair)
38- Pe Close_Punctuation a closing punctuation mark (of a pair)
39+ Pd Dash_Punctuation a dash or hyphen punctuation mark
40+ Ps Open_Punctuation an opening punctuation mark (of a pair)
41+ Pe Close_Punctuation a closing punctuation mark (of a pair)
3942 Pi Initial_Punctuation an initial quotation mark
40- Pf Final_Punctuation a final quotation mark
41- Po Other_Punctuation a punctuation mark of other type
42- Sm Math_Symbol a symbol of primarily mathematical use
43- Sc Currency_Symbol a currency sign
44- Sk Modifier_Symbol a non-letterlike modifier symbol
45- So Other_Symbol a symbol of other type
46- Zs Space_Separator a space character (of various non-zero widths)
47- Zl Line_Separator U+2028 LINE SEPARATOR only
43+ Pf Final_Punctuation a final quotation mark
44+ Po Other_Punctuation a punctuation mark of other type
45+ Sm Math_Symbol a symbol of primarily mathematical use
46+ Sc Currency_Symbol a currency sign
47+ Sk Modifier_Symbol a non-letterlike modifier symbol
48+ So Other_Symbol a symbol of other type
49+ Zs Space_Separator a space character (of various non-zero widths)
50+ Zl Line_Separator U+2028 LINE SEPARATOR only
4851 Zp Paragraph_Separator U+2029 PARAGRAPH SEPARATOR only
49- Cc Control a C0 or C1 control code
50- Cf Format a format control character
51- Cs Surrogate a surrogate code point
52- Co Private_Use a private-use character
53- Cn Unassigned a reserved unassigned code point or a noncharacter
52+ Cc Control a C0 or C1 control code
53+ Cf Format a format control character
54+ Cs Surrogate a surrogate code point
55+ Co Private_Use a private-use character
56+ Cn Unassigned a reserved unassigned code point or a noncharacter
5457*/
5558
5659pub fn is_alphabetic ( c : char ) -> bool { derived_property:: Alphabetic ( c) }
@@ -62,18 +65,14 @@ pub fn is_XID_continue(c: char) -> bool { derived_property::XID_Continue(c) }
6265 * in terms of the Unicode General Category 'Ll'
6366 */
6467#[ inline( always) ]
65- pub fn is_lowercase ( c : char ) -> bool {
66- return general_category:: Ll ( c) ;
67- }
68+ pub fn is_lowercase ( c : char ) -> bool { general_category:: Ll ( c) }
6869
6970/**
7071 * Indicates whether a character is in upper case, defined
7172 * in terms of the Unicode General Category 'Lu'.
7273 */
7374#[ inline( always) ]
74- pub fn is_uppercase ( c : char ) -> bool {
75- return general_category:: Lu ( c) ;
76- }
75+ pub fn is_uppercase ( c : char ) -> bool { general_category:: Lu ( c) }
7776
7877/**
7978 * Indicates whether a character is whitespace. Whitespace is defined in
@@ -82,10 +81,10 @@ pub fn is_uppercase(c: char) -> bool {
8281 */
8382#[ inline( always) ]
8483pub fn is_whitespace ( c : char ) -> bool {
85- return ( '\x09' <= c && c <= '\x0d' )
84+ ( '\x09' <= c && c <= '\x0d' )
8685 || general_category:: Zs ( c)
8786 || general_category:: Zl ( c)
88- || general_category:: Zp ( c) ;
87+ || general_category:: Zp ( c)
8988}
9089
9190/**
@@ -95,18 +94,18 @@ pub fn is_whitespace(c: char) -> bool {
9594 */
9695#[ inline( always) ]
9796pub fn is_alphanumeric ( c : char ) -> bool {
98- return derived_property:: Alphabetic ( c) ||
99- general_category:: Nd ( c) ||
100- general_category:: Nl ( c) ||
101- general_category:: No ( c) ;
97+ derived_property:: Alphabetic ( c)
98+ || general_category:: Nd ( c)
99+ || general_category:: Nl ( c)
100+ || general_category:: No ( c)
102101}
103102
104103/// Indicates whether the character is numeric (Nd, Nl, or No)
105104#[ inline( always) ]
106105pub fn is_digit ( c : char ) -> bool {
107- return general_category:: Nd ( c) ||
108- general_category:: Nl ( c) ||
109- general_category:: No ( c) ;
106+ general_category:: Nd ( c)
107+ || general_category:: Nl ( c)
108+ || general_category:: No ( c)
110109}
111110
112111/**
@@ -125,7 +124,7 @@ pub fn is_digit(c: char) -> bool {
125124pub fn is_digit_radix ( c : char , radix : uint ) -> bool {
126125 match to_digit ( c, radix) {
127126 Some ( _) => true ,
128- None => false
127+ None => false ,
129128 }
130129}
131130
@@ -151,7 +150,7 @@ pub fn to_digit(c: char, radix: uint) -> Option<uint> {
151150 '0' .. '9' => c as uint - ( '0' as uint ) ,
152151 'a' .. 'z' => c as uint + 10 u - ( 'a' as uint ) ,
153152 'A' .. 'Z' => c as uint + 10 u - ( 'A' as uint ) ,
154- _ => return None
153+ _ => return None ,
155154 } ;
156155 if val < radix { Some ( val) }
157156 else { None }
@@ -181,6 +180,21 @@ pub fn from_digit(num: uint, radix: uint) -> Option<char> {
181180 }
182181}
183182
183+ #[ cfg( stage0) ]
184+ pub fn escape_unicode ( c : char ) -> ~str {
185+ let s = u32:: to_str_radix ( c as u32 , 16 u) ;
186+ let ( c, pad) = ( if c <= '\xff' { ( 'x' , 2 u) }
187+ else if c <= '\uffff' { ( 'u' , 4 u) }
188+ else { ( 'U' , 8 u) } ) ;
189+ assert ! ( str :: len( s) <= pad) ;
190+ let mut out = ~"\\ ";
191+ str:: push_str ( & mut out, str:: from_char ( c) ) ;
192+ for uint:: range( str:: len( s) , pad) |_i|
193+ { str:: push_str( & mut out, ~"0 ") ; }
194+ str:: push_str ( & mut out, s) ;
195+ out
196+ }
197+
184198/**
185199 * Return the hexadecimal unicode escape of a char.
186200 *
@@ -190,17 +204,21 @@ pub fn from_digit(num: uint, radix: uint) -> Option<char> {
190204 * - chars in [0x100,0xffff] get 4-digit escapes: `\\uNNNN`
190204 * - chars at or above 0x10000 get 8-digit escapes: `\\UNNNNNNNN`
192206 */
207+ #[ cfg( not( stage0) ) ]
193208pub fn escape_unicode ( c : char ) -> ~str {
194209 let s = u32:: to_str_radix ( c as u32 , 16 u) ;
195- let ( c, pad) = ( if c <= '\xff' { ( 'x' , 2 u) }
196- else if c <= '\uffff' { ( 'u' , 4 u) }
197- else { ( 'U' , 8 u) } ) ;
198- assert ! ( str :: len( s) <= pad) ;
210+ let ( c, pad) = cond ! (
211+ ( c <= '\xff' ) { ( 'x' , 2 u) }
212+ ( c <= '\uffff' ) { ( 'u' , 4 u) }
213+ _ { ( 'U' , 8 u) }
214+ ) ;
215+ assert ! ( s. len( ) <= pad) ;
199216 let mut out = ~"\\ ";
200- str:: push_str ( & mut out, str:: from_char ( c) ) ;
201- for uint:: range( str:: len( s) , pad) |_i|
202- { str:: push_str( & mut out, ~"0 ") ; }
203- str:: push_str ( & mut out, s) ;
217+ out. push_str ( str:: from_char ( c) ) ;
218+ for uint:: range( s. len( ) , pad) |_| {
219+ out. push_str ( "0" ) ;
220+ }
221+ out. push_str ( s) ;
204222 out
205223}
206224
@@ -218,18 +236,18 @@ pub fn escape_unicode(c: char) -> ~str {
218236 */
219237pub fn escape_default ( c : char ) -> ~str {
220238 match c {
221- '\t' => ~"\\ t",
222- '\r' => ~"\\ r",
223- '\n' => ~" \\ n",
224- '\\' => ~"\\ \\ ",
225- '\'' => ~"\\ ' ",
226- '"' => ~"\\ \" ",
227- '\x20' .. '\x7e' => str:: from_char ( c) ,
228- _ => escape_unicode ( c )
239+ '\t' => ~"\\ t",
240+ '\r' => ~"\\ r",
241+ '\n' => ~" \\ n",
242+ '\\' => ~"\\ \\ ",
243+ '\'' => ~"\\ ' ",
244+ '"' => ~"\\ \" ",
245+ '\x20' .. '\x7e' => str:: from_char ( c) ,
246+ _ => c . escape_unicode ( ) ,
229247 }
230248}
231249
232- /// Returns the amount of bytes this character would need if encoded in utf8
250+ # [ cfg ( stage0 ) ]
233251pub fn len_utf8_bytes ( c : char ) -> uint {
234252 static max_one_b: uint = 128 u;
235253 static max_two_b: uint = 2048 u;
@@ -244,6 +262,24 @@ pub fn len_utf8_bytes(c: char) -> uint {
244262 else { fail ! ( "invalid character!" ) }
245263}
246264
265+ /// Returns the number of bytes this character would need if encoded in UTF-8
266+ #[ cfg( not( stage0) ) ]
267+ pub fn len_utf8_bytes ( c : char ) -> uint {
268+ static MAX_ONE_B : uint = 128 u;
269+ static MAX_TWO_B : uint = 2048 u;
270+ static MAX_THREE_B : uint = 65536 u;
271+ static MAX_FOUR_B : uint = 2097152 u;
272+
273+ let code = c as uint ;
274+ cond ! (
275+ ( code < MAX_ONE_B ) { 1 u }
276+ ( code < MAX_TWO_B ) { 2 u }
277+ ( code < MAX_THREE_B ) { 3 u }
278+ ( code < MAX_FOUR_B ) { 4 u }
279+ _ { fail!( "invalid character!" ) }
280+ )
281+ }
282+
247283pub trait Char {
248284 fn is_alphabetic ( & self ) -> bool ;
249285 fn is_XID_start ( & self ) -> bool ;
0 commit comments