001    /*
002     * Copyright (C) 2010 The Guava Authors
003     *
004     * Licensed under the Apache License, Version 2.0 (the "License");
005     * you may not use this file except in compliance with the License.
006     * You may obtain a copy of the License at
007     *
008     * http://www.apache.org/licenses/LICENSE-2.0
009     *
010     * Unless required by applicable law or agreed to in writing, software
011     * distributed under the License is distributed on an "AS IS" BASIS,
012     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013     * See the License for the specific language governing permissions and
014     * limitations under the License.
015     */
016    
017    package com.google.common.base;
018    
019    import com.google.common.annotations.Beta;
020    import com.google.common.annotations.GwtCompatible;
021    
022    /**
023     * Static methods pertaining to ASCII characters (those in the range of values
024     * {@code 0x00} through {@code 0x7F}), and to strings containing such
025     * characters.
026     *
027     * <p>ASCII utilities also exist in other classes of this package:
028     * <ul>
029     * <!-- TODO(kevinb): how can we make this not produce a warning when building gwt javadoc? -->
030     * <li>{@link Charsets#US_ASCII} specifies the {@code Charset} of ASCII characters.
031     * <li>{@link CharMatcher#ASCII} matches ASCII characters and provides text processing methods
032     *     which operate only on the ASCII characters of a string.
033     * </ul>
034     *
035     * @author Craig Berry
036     * @author Gregory Kick
037     * @since 7.0
038     */
039    @GwtCompatible
040    public final class Ascii {
041    
042      private Ascii() {}
043    
044      /* The ASCII control characters, per RFC 20. */
045      /**
046       * Null ('\0'): The all-zeros character which may serve to accomplish
047       * time fill and media fill.  Normally used as a C string terminator.
048       * <p>Although RFC 20 names this as "Null", note that it is distinct
049       * from the C/C++ "NULL" pointer.
050       *
051       * @since 8.0
052       */
053      public static final byte NUL = 0;
054    
055      /**
056       * Start of Heading: A communication control character used at
057       * the beginning of a sequence of characters which constitute a
058       * machine-sensible address or routing information.  Such a sequence is
059       * referred to as the "heading."  An STX character has the effect of
060       * terminating a heading.
061       *
062       * @since 8.0
063       */
064      public static final byte SOH = 1;
065    
066      /**
067       * Start of Text: A communication control character which
068       * precedes a sequence of characters that is to be treated as an entity
069       * and entirely transmitted through to the ultimate destination.  Such a
070       * sequence is referred to as "text."  STX may be used to terminate a
071       * sequence of characters started by SOH.
072       *
073       * @since 8.0
074       */
075      public static final byte STX = 2;
076    
077      /**
078       * End of Text: A communication control character used to
079       * terminate a sequence of characters started with STX and transmitted
080       * as an entity.
081       *
082       * @since 8.0
083       */
084      public static final byte ETX = 3;
085    
086      /**
087       * End of Transmission: A communication control character used
088       * to indicate the conclusion of a transmission, which may have
089       * contained one or more texts and any associated headings.
090       *
091       * @since 8.0
092       */
093      public static final byte EOT = 4;
094    
095      /**
096       * Enquiry: A communication control character used in data
097       * communication systems as a request for a response from a remote
098       * station.  It may be used as a "Who Are You" (WRU) to obtain
099       * identification, or may be used to obtain station status, or both.
100       *
101       * @since 8.0
102       */
103      public static final byte ENQ = 5;
104    
105      /**
106       * Acknowledge: A communication control character transmitted
107       * by a receiver as an affirmative response to a sender.
108       *
109       * @since 8.0
110       */
111      public static final byte ACK = 6;
112    
113      /**
114       * Bell ('\a'): A character for use when there is a need to call for
115       * human attention.  It may control alarm or attention devices.
116       *
117       * @since 8.0
118       */
119      public static final byte BEL = 7;
120    
121      /**
122       * Backspace ('\b'): A format effector which controls the movement of
123       * the printing position one printing space backward on the same
124       * printing line.  (Applicable also to display devices.)
125       *
126       * @since 8.0
127       */
128      public static final byte BS = 8;
129    
130      /**
131       * Horizontal Tabulation ('\t'): A format effector which controls the
132       * movement of the printing position to the next in a series of
133       * predetermined positions along the printing line.  (Applicable also to
134       * display devices and the skip function on punched cards.)
135       *
136       * @since 8.0
137       */
138      public static final byte HT = 9;
139    
140      /**
141       * Line Feed ('\n'): A format effector which controls the movement of
142       * the printing position to the next printing line.  (Applicable also to
143       * display devices.) Where appropriate, this character may have the
144       * meaning "New Line" (NL), a format effector which controls the
145       * movement of the printing point to the first printing position on the
146       * next printing line.  Use of this convention requires agreement
147       * between sender and recipient of data.
148       *
149       * @since 8.0
150       */
151      public static final byte LF = 10;
152    
153      /**
154       * Alternate name for {@link #LF}.  ({@code LF} is preferred.)
155       *
156       * @since 8.0
157       */
158      public static final byte NL = 10;
159    
160      /**
161       * Vertical Tabulation ('\v'): A format effector which controls the
162       * movement of the printing position to the next in a series of
163       * predetermined printing lines.  (Applicable also to display devices.)
164       *
165       * @since 8.0
166       */
167      public static final byte VT = 11;
168    
169      /**
170       * Form Feed ('\f'): A format effector which controls the movement of
171       * the printing position to the first pre-determined printing line on
172       * the next form or page.  (Applicable also to display devices.)
173       *
174       * @since 8.0
175       */
176      public static final byte FF = 12;
177    
178      /**
179       * Carriage Return ('\r'): A format effector which controls the
180       * movement of the printing position to the first printing position on
181       * the same printing line.  (Applicable also to display devices.)
182       *
183       * @since 8.0
184       */
185      public static final byte CR = 13;
186    
187      /**
188       * Shift Out: A control character indicating that the code
189       * combinations which follow shall be interpreted as outside of the
190       * character set of the standard code table until a Shift In character
191       * is reached.
192       *
193       * @since 8.0
194       */
195      public static final byte SO = 14;
196    
197      /**
198       * Shift In: A control character indicating that the code
199       * combinations which follow shall be interpreted according to the
200       * standard code table.
201       *
202       * @since 8.0
203       */
204      public static final byte SI = 15;
205    
206      /**
207       * Data Link Escape: A communication control character which
208       * will change the meaning of a limited number of contiguously following
209       * characters.  It is used exclusively to provide supplementary controls
210       * in data communication networks.
211       *
212       * @since 8.0
213       */
214      public static final byte DLE = 16;
215    
216      /**
217       * Device Controls: Characters for the control
218       * of ancillary devices associated with data processing or
219       * telecommunication systems, more especially switching devices "on" or
220       * "off."  (If a single "stop" control is required to interrupt or turn
221       * off ancillary devices, DC4 is the preferred assignment.)
222       *
223       * @since 8.0
224       */
225      public static final byte DC1 = 17; // aka XON
226    
227      /**
228       * Transmission on/off: Although originally defined as DC1, this ASCII
229       * control character is now better known as the XON code used for software
230       * flow control in serial communications.  The main use is restarting
231       * the transmission after the communication has been stopped by the XOFF
232       * control code.
233       *
234       * @since 8.0
235       */
236      public static final byte XON = 17; // aka DC1
237    
238      /**
239       * @see #DC1
240       *
241       * @since 8.0
242       */
243      public static final byte DC2 = 18;
244    
245      /**
246       * @see #DC1
247       *
248       * @since 8.0
249       */
250      public static final byte DC3 = 19; // aka XOFF
251    
252      /**
253       * Transmission off. @see #XON
254       *
255       * @since 8.0
256       */
257      public static final byte XOFF = 19; // aka DC3
258    
259      /**
260       * @see #DC1
261       *
262       * @since 8.0
263       */
264      public static final byte DC4 = 20;
265    
266      /**
267       * Negative Acknowledge: A communication control character
268       * transmitted by a receiver as a negative response to the sender.
269       *
270       * @since 8.0
271       */
272      public static final byte NAK = 21;
273    
274      /**
275       * Synchronous Idle: A communication control character used by
276       * a synchronous transmission system in the absence of any other
277       * character to provide a signal from which synchronism may be achieved
278       * or retained.
279       *
280       * @since 8.0
281       */
282      public static final byte SYN = 22;
283    
284      /**
285       * End of Transmission Block: A communication control character
286       * used to indicate the end of a block of data for communication
287       * purposes.  ETB is used for blocking data where the block structure is
288       * not necessarily related to the processing format.
289       *
290       * @since 8.0
291       */
292      public static final byte ETB = 23;
293    
294      /**
295       * Cancel: A control character used to indicate that the data
296       * with which it is sent is in error or is to be disregarded.
297       *
298       * @since 8.0
299       */
300      public static final byte CAN = 24;
301    
302      /**
303       * End of Medium: A control character associated with the sent
304       * data which may be used to identify the physical end of the medium, or
305       * the end of the used, or wanted, portion of information recorded on a
306       * medium.  (The position of this character does not necessarily
307       * correspond to the physical end of the medium.)
308       *
309       * @since 8.0
310       */
311      public static final byte EM = 25;
312    
313      /**
314       * Substitute: A character that may be substituted for a
315       * character which is determined to be invalid or in error.
316       *
317       * @since 8.0
318       */
319      public static final byte SUB = 26;
320    
321      /**
322       * Escape: A control character intended to provide code
323       * extension (supplementary characters) in general information
324       * interchange.  The Escape character itself is a prefix affecting the
325       * interpretation of a limited number of contiguously following
326       * characters.
327       *
328       * @since 8.0
329       */
330      public static final byte ESC = 27;
331    
332      /**
333       * File/Group/Record/Unit Separator: These information separators may be
334       * used within data in optional fashion, except that their hierarchical
335       * relationship shall be: FS is the most inclusive, then GS, then RS,
336       * and US is least inclusive.  (The content and length of a File, Group,
337       * Record, or Unit are not specified.)
338       *
339       * @since 8.0
340       */
341      public static final byte FS = 28;
342    
343      /**
344       * @see #FS
345       *
346       * @since 8.0
347       */
348      public static final byte GS = 29;
349    
350      /**
351       * @see #FS
352       *
353       * @since 8.0
354       */
355      public static final byte RS = 30;
356    
357      /**
358       * @see #FS
359       *
360       * @since 8.0
361       */
362      public static final byte US = 31;
363    
364      /**
365       * Space: A normally non-printing graphic character used to
366       * separate words.  It is also a format effector which controls the
367       * movement of the printing position, one printing position forward.
368       * (Applicable also to display devices.)
369       *
370       * @since 8.0
371       */
372      public static final byte SP = 32;
373    
374      /**
375       * Alternate name for {@link #SP}.
376       *
377       * @since 8.0
378       */
379      public static final byte SPACE = 32;
380    
381      /**
382       * Delete: This character is used primarily to "erase" or
383       * "obliterate" erroneous or unwanted characters in perforated tape.
384       *
385       * @since 8.0
386       */
387      public static final byte DEL = 127;
388    
389      /**
390       * The minimum value of an ASCII character.
391       *
392       * @since 9.0 (was type {@code int} before 12.0)
393       */
394      @Beta
395      public static final char MIN = 0;
396    
397      /**
398       * The maximum value of an ASCII character.
399       *
400       * @since 9.0 (was type {@code int} before 12.0)
401       */
402      @Beta
403      public static final char MAX = 127;
404    
405      /**
406       * Returns a copy of the input string in which all {@linkplain #isUpperCase(char) uppercase ASCII
407       * characters} have been converted to lowercase. All other characters are copied without
408       * modification.
409       */
410      public static String toLowerCase(String string) {
411        int length = string.length();
412        StringBuilder builder = new StringBuilder(length);
413        for (int i = 0; i < length; i++) {
414          builder.append(toLowerCase(string.charAt(i)));
415        }
416        return builder.toString();
417      }
418    
419      /**
420       * If the argument is an {@linkplain #isUpperCase(char) uppercase ASCII character} returns the
421       * lowercase equivalent. Otherwise returns the argument.
422       */
423      public static char toLowerCase(char c) {
424        return isUpperCase(c) ? (char) (c ^ 0x20) : c;
425      }
426    
427      /**
428       * Returns a copy of the input string in which all {@linkplain #isLowerCase(char) lowercase ASCII
429       * characters} have been converted to uppercase. All other characters are copied without
430       * modification.
431       */
432      public static String toUpperCase(String string) {
433        int length = string.length();
434        StringBuilder builder = new StringBuilder(length);
435        for (int i = 0; i < length; i++) {
436          builder.append(toUpperCase(string.charAt(i)));
437        }
438        return builder.toString();
439      }
440    
441      /**
442       * If the argument is a {@linkplain #isLowerCase(char) lowercase ASCII character} returns the
443       * uppercase equivalent. Otherwise returns the argument.
444       */
445      public static char toUpperCase(char c) {
446        return isLowerCase(c) ? (char) (c & 0x5f) : c;
447      }
448    
449      /**
450       * Indicates whether {@code c} is one of the twenty-six lowercase ASCII alphabetic characters
451       * between {@code 'a'} and {@code 'z'} inclusive. All others (including non-ASCII characters)
452       * return {@code false}.
453       */
454      public static boolean isLowerCase(char c) {
455        return (c >= 'a') && (c <= 'z');
456      }
457    
458      /**
459       * Indicates whether {@code c} is one of the twenty-six uppercase ASCII alphabetic characters
460       * between {@code 'A'} and {@code 'Z'} inclusive. All others (including non-ASCII characters)
461       * return {@code false}.
462       */
463      public static boolean isUpperCase(char c) {
464        return (c >= 'A') && (c <= 'Z');
465      }
466    }