Source code

001/*
002 * Copyright (C) 2010 The Guava Authors
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 * http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 */
016
017package com.google.common.base;
018
019import static com.google.common.base.Preconditions.checkArgument;
020import static com.google.common.base.Preconditions.checkNotNull;
021
022import com.google.common.annotations.Beta;
023import com.google.common.annotations.GwtCompatible;
024
025import javax.annotation.CheckReturnValue;
026
027/**
028 * Static methods pertaining to ASCII characters (those in the range of values
029 * {@code 0x00} through {@code 0x7F}), and to strings containing such
030 * characters.
031 *
032 * <p>ASCII utilities also exist in other classes of this package:
033 * <ul>
034 * <!-- TODO(kevinb): how can we make this not produce a warning when building gwt javadoc? -->
035 * <li>{@link Charsets#US_ASCII} specifies the {@code Charset} of ASCII characters.
036 * <li>{@link CharMatcher#ASCII} matches ASCII characters and provides text processing methods
037 *     which operate only on the ASCII characters of a string.
038 * </ul>
039 *
040 * @author Craig Berry
041 * @author Gregory Kick
042 * @since 7.0
043 */
044@GwtCompatible
045public final class Ascii {
046
047  private Ascii() {}
048
049  /* The ASCII control characters, per RFC 20. */
050  /**
051   * Null ('\0'): The all-zeros character which may serve to accomplish
052   * time fill and media fill.  Normally used as a C string terminator.
053   * <p>Although RFC 20 names this as "Null", note that it is distinct
054   * from the C/C++ "NULL" pointer.
055   *
056   * @since 8.0
057   */
058  public static final byte NUL = 0;
059
060  /**
061   * Start of Heading: A communication control character used at
062   * the beginning of a sequence of characters which constitute a
063   * machine-sensible address or routing information.  Such a sequence is
064   * referred to as the "heading."  An STX character has the effect of
065   * terminating a heading.
066   *
067   * @since 8.0
068   */
069  public static final byte SOH = 1;
070
071  /**
072   * Start of Text: A communication control character which
073   * precedes a sequence of characters that is to be treated as an entity
074   * and entirely transmitted through to the ultimate destination.  Such a
075   * sequence is referred to as "text."  STX may be used to terminate a
076   * sequence of characters started by SOH.
077   *
078   * @since 8.0
079   */
080  public static final byte STX = 2;
081
082  /**
083   * End of Text: A communication control character used to
084   * terminate a sequence of characters started with STX and transmitted
085   * as an entity.
086   *
087   * @since 8.0
088   */
089  public static final byte ETX = 3;
090
091  /**
092   * End of Transmission: A communication control character used
093   * to indicate the conclusion of a transmission, which may have
094   * contained one or more texts and any associated headings.
095   *
096   * @since 8.0
097   */
098  public static final byte EOT = 4;
099
100  /**
101   * Enquiry: A communication control character used in data
102   * communication systems as a request for a response from a remote
103   * station.  It may be used as a "Who Are You" (WRU) to obtain
104   * identification, or may be used to obtain station status, or both.
105   *
106   * @since 8.0
107   */
108  public static final byte ENQ = 5;
109
110  /**
111   * Acknowledge: A communication control character transmitted
112   * by a receiver as an affirmative response to a sender.
113   *
114   * @since 8.0
115   */
116  public static final byte ACK = 6;
117
118  /**
119   * Bell ('\a'): A character for use when there is a need to call for
120   * human attention.  It may control alarm or attention devices.
121   *
122   * @since 8.0
123   */
124  public static final byte BEL = 7;
125
126  /**
127   * Backspace ('\b'): A format effector which controls the movement of
128   * the printing position one printing space backward on the same
129   * printing line.  (Applicable also to display devices.)
130   *
131   * @since 8.0
132   */
133  public static final byte BS = 8;
134
135  /**
136   * Horizontal Tabulation ('\t'): A format effector which controls the
137   * movement of the printing position to the next in a series of
138   * predetermined positions along the printing line.  (Applicable also to
139   * display devices and the skip function on punched cards.)
140   *
141   * @since 8.0
142   */
143  public static final byte HT = 9;
144
145  /**
146   * Line Feed ('\n'): A format effector which controls the movement of
147   * the printing position to the next printing line.  (Applicable also to
148   * display devices.) Where appropriate, this character may have the
149   * meaning "New Line" (NL), a format effector which controls the
150   * movement of the printing point to the first printing position on the
151   * next printing line.  Use of this convention requires agreement
152   * between sender and recipient of data.
153   *
154   * @since 8.0
155   */
156  public static final byte LF = 10;
157
158  /**
159   * Alternate name for {@link #LF}.  ({@code LF} is preferred.)
160   *
161   * @since 8.0
162   */
163  public static final byte NL = 10;
164
165  /**
166   * Vertical Tabulation ('\v'): A format effector which controls the
167   * movement of the printing position to the next in a series of
168   * predetermined printing lines.  (Applicable also to display devices.)
169   *
170   * @since 8.0
171   */
172  public static final byte VT = 11;
173
174  /**
175   * Form Feed ('\f'): A format effector which controls the movement of
176   * the printing position to the first pre-determined printing line on
177   * the next form or page.  (Applicable also to display devices.)
178   *
179   * @since 8.0
180   */
181  public static final byte FF = 12;
182
183  /**
184   * Carriage Return ('\r'): A format effector which controls the
185   * movement of the printing position to the first printing position on
186   * the same printing line.  (Applicable also to display devices.)
187   *
188   * @since 8.0
189   */
190  public static final byte CR = 13;
191
192  /**
193   * Shift Out: A control character indicating that the code
194   * combinations which follow shall be interpreted as outside of the
195   * character set of the standard code table until a Shift In character
196   * is reached.
197   *
198   * @since 8.0
199   */
200  public static final byte SO = 14;
201
202  /**
203   * Shift In: A control character indicating that the code
204   * combinations which follow shall be interpreted according to the
205   * standard code table.
206   *
207   * @since 8.0
208   */
209  public static final byte SI = 15;
210
211  /**
212   * Data Link Escape: A communication control character which
213   * will change the meaning of a limited number of contiguously following
214   * characters.  It is used exclusively to provide supplementary controls
215   * in data communication networks.
216   *
217   * @since 8.0
218   */
219  public static final byte DLE = 16;
220
221  /**
222   * Device Control 1. Characters for the control
223   * of ancillary devices associated with data processing or
224   * telecommunication systems, more especially switching devices "on" or
225   * "off."  (If a single "stop" control is required to interrupt or turn
226   * off ancillary devices, DC4 is the preferred assignment.)
227   *
228   * @since 8.0
229   */
230  public static final byte DC1 = 17; // aka XON
231
232  /**
233   * Transmission On: Although originally defined as DC1, this ASCII
234   * control character is now better known as the XON code used for software
235   * flow control in serial communications.  The main use is restarting
236   * the transmission after the communication has been stopped by the XOFF
237   * control code.
238   *
239   * @since 8.0
240   */
241  public static final byte XON = 17; // aka DC1
242
243  /**
244   * Device Control 2. Characters for the control
245   * of ancillary devices associated with data processing or
246   * telecommunication systems, more especially switching devices "on" or
247   * "off."  (If a single "stop" control is required to interrupt or turn
248   * off ancillary devices, DC4 is the preferred assignment.)
249   *
250   * @since 8.0
251   */
252  public static final byte DC2 = 18;
253
254  /**
255   * Device Control 3. Characters for the control
256   * of ancillary devices associated with data processing or
257   * telecommunication systems, more especially switching devices "on" or
258   * "off."  (If a single "stop" control is required to interrupt or turn
259   * off ancillary devices, DC4 is the preferred assignment.)
260   *
261   * @since 8.0
262   */
263  public static final byte DC3 = 19; // aka XOFF
264
265  /**
266   * Transmission off. See {@link #XON} for explanation.
267   *
268   * @since 8.0
269   */
270  public static final byte XOFF = 19; // aka DC3
271
272  /**
273   * Device Control 4. Characters for the control
274   * of ancillary devices associated with data processing or
275   * telecommunication systems, more especially switching devices "on" or
276   * "off."  (If a single "stop" control is required to interrupt or turn
277   * off ancillary devices, DC4 is the preferred assignment.)
278   *
279   * @since 8.0
280   */
281  public static final byte DC4 = 20;
282
283  /**
284   * Negative Acknowledge: A communication control character
285   * transmitted by a receiver as a negative response to the sender.
286   *
287   * @since 8.0
288   */
289  public static final byte NAK = 21;
290
291  /**
292   * Synchronous Idle: A communication control character used by
293   * a synchronous transmission system in the absence of any other
294   * character to provide a signal from which synchronism may be achieved
295   * or retained.
296   *
297   * @since 8.0
298   */
299  public static final byte SYN = 22;
300
301  /**
302   * End of Transmission Block: A communication control character
303   * used to indicate the end of a block of data for communication
304   * purposes.  ETB is used for blocking data where the block structure is
305   * not necessarily related to the processing format.
306   *
307   * @since 8.0
308   */
309  public static final byte ETB = 23;
310
311  /**
312   * Cancel: A control character used to indicate that the data
313   * with which it is sent is in error or is to be disregarded.
314   *
315   * @since 8.0
316   */
317  public static final byte CAN = 24;
318
319  /**
320   * End of Medium: A control character associated with the sent
321   * data which may be used to identify the physical end of the medium, or
322   * the end of the used, or wanted, portion of information recorded on a
323   * medium.  (The position of this character does not necessarily
324   * correspond to the physical end of the medium.)
325   *
326   * @since 8.0
327   */
328  public static final byte EM = 25;
329
330  /**
331   * Substitute: A character that may be substituted for a
332   * character which is determined to be invalid or in error.
333   *
334   * @since 8.0
335   */
336  public static final byte SUB = 26;
337
338  /**
339   * Escape: A control character intended to provide code
340   * extension (supplementary characters) in general information
341   * interchange.  The Escape character itself is a prefix affecting the
342   * interpretation of a limited number of contiguously following
343   * characters.
344   *
345   * @since 8.0
346   */
347  public static final byte ESC = 27;
348
349  /**
350   * File Separator: These four information separators may be
351   * used within data in optional fashion, except that their hierarchical
352   * relationship shall be: FS is the most inclusive, then GS, then RS,
353   * and US is least inclusive.  (The content and length of a File, Group,
354   * Record, or Unit are not specified.)
355   *
356   * @since 8.0
357   */
358  public static final byte FS = 28;
359
360  /**
361   * Group Separator: These four information separators may be
362   * used within data in optional fashion, except that their hierarchical
363   * relationship shall be: FS is the most inclusive, then GS, then RS,
364   * and US is least inclusive.  (The content and length of a File, Group,
365   * Record, or Unit are not specified.)
366   *
367   * @since 8.0
368   */
369  public static final byte GS = 29;
370
371  /**
372   * Record Separator: These four information separators may be
373   * used within data in optional fashion, except that their hierarchical
374   * relationship shall be: FS is the most inclusive, then GS, then RS,
375   * and US is least inclusive.  (The content and length of a File, Group,
376   * Record, or Unit are not specified.)
377   *
378   * @since 8.0
379   */
380  public static final byte RS = 30;
381
382  /**
383   * Unit Separator: These four information separators may be
384   * used within data in optional fashion, except that their hierarchical
385   * relationship shall be: FS is the most inclusive, then GS, then RS,
386   * and US is least inclusive.  (The content and length of a File, Group,
387   * Record, or Unit are not specified.)
388   *
389   * @since 8.0
390   */
391  public static final byte US = 31;
392
393  /**
394   * Space: A normally non-printing graphic character used to
395   * separate words.  It is also a format effector which controls the
396   * movement of the printing position, one printing position forward.
397   * (Applicable also to display devices.)
398   *
399   * @since 8.0
400   */
401  public static final byte SP = 32;
402
403  /**
404   * Alternate name for {@link #SP}.
405   *
406   * @since 8.0
407   */
408  public static final byte SPACE = 32;
409
410  /**
411   * Delete: This character is used primarily to "erase" or
412   * "obliterate" erroneous or unwanted characters in perforated tape.
413   *
414   * @since 8.0
415   */
416  public static final byte DEL = 127;
417
418  /**
419   * The minimum value of an ASCII character.
420   *
421   * @since 9.0 (was type {@code int} before 12.0)
422   */
423  public static final char MIN = 0;
424
425  /**
426   * The maximum value of an ASCII character.
427   *
428   * @since 9.0 (was type {@code int} before 12.0)
429   */
430  public static final char MAX = 127;
431
432  /**
433   * Returns a copy of the input string in which all {@linkplain #isUpperCase(char) uppercase ASCII
434   * characters} have been converted to lowercase. All other characters are copied without
435   * modification.
436   */
437  public static String toLowerCase(String string) {
438    int length = string.length();
439    for (int i = 0; i < length; i++) {
440      if (isUpperCase(string.charAt(i))) {
441        char[] chars = string.toCharArray();
442        for (; i < length; i++) {
443          char c = chars[i];
444          if (isUpperCase(c)) {
445            chars[i] = (char) (c ^ 0x20);
446          }
447        }
448        return String.valueOf(chars);
449      }
450    }
451    return string;
452  }
453
454  /**
455   * Returns a copy of the input character sequence in which all {@linkplain #isUpperCase(char)
456   * uppercase ASCII characters} have been converted to lowercase. All other characters are copied
457   * without modification.
458   *
459   * @since 14.0
460   */
461  public static String toLowerCase(CharSequence chars) {
462    if (chars instanceof String) {
463      return toLowerCase((String) chars);
464    }
465    int length = chars.length();
466    StringBuilder builder = new StringBuilder(length);
467    for (int i = 0; i < length; i++) {
468      builder.append(toLowerCase(chars.charAt(i)));
469    }
470    return builder.toString();
471  }
472
473  /**
474   * If the argument is an {@linkplain #isUpperCase(char) uppercase ASCII character} returns the
475   * lowercase equivalent. Otherwise returns the argument.
476   */
477  public static char toLowerCase(char c) {
478    return isUpperCase(c) ? (char) (c ^ 0x20) : c;
479  }
480
481  /**
482   * Returns a copy of the input string in which all {@linkplain #isLowerCase(char) lowercase ASCII
483   * characters} have been converted to uppercase. All other characters are copied without
484   * modification.
485   */
486  public static String toUpperCase(String string) {
487    int length = string.length();
488    for (int i = 0; i < length; i++) {
489      if (isLowerCase(string.charAt(i))) {
490        char[] chars = string.toCharArray();
491        for (; i < length; i++) {
492          char c = chars[i];
493          if (isLowerCase(c)) {
494            chars[i] = (char) (c & 0x5f);
495          }
496        }
497        return String.valueOf(chars);
498      }
499    }
500    return string;
501  }
502
503  /**
504   * Returns a copy of the input character sequence in which all {@linkplain #isLowerCase(char)
505   * lowercase ASCII characters} have been converted to uppercase. All other characters are copied
506   * without modification.
507   *
508   * @since 14.0
509   */
510  public static String toUpperCase(CharSequence chars) {
511    if (chars instanceof String) {
512      return toUpperCase((String) chars);
513    }
514    int length = chars.length();
515    StringBuilder builder = new StringBuilder(length);
516    for (int i = 0; i < length; i++) {
517      builder.append(toUpperCase(chars.charAt(i)));
518    }
519    return builder.toString();
520  }
521
522  /**
523   * If the argument is a {@linkplain #isLowerCase(char) lowercase ASCII character} returns the
524   * uppercase equivalent. Otherwise returns the argument.
525   */
526  public static char toUpperCase(char c) {
527    return isLowerCase(c) ? (char) (c & 0x5f) : c;
528  }
529
530  /**
531   * Indicates whether {@code c} is one of the twenty-six lowercase ASCII alphabetic characters
532   * between {@code 'a'} and {@code 'z'} inclusive. All others (including non-ASCII characters)
533   * return {@code false}.
534   */
535  public static boolean isLowerCase(char c) {
536    // Note: This was benchmarked against the alternate expression "(char)(c - 'a') < 26" (Nov '13)
537    // and found to perform at least as well, or better.
538    return (c >= 'a') && (c <= 'z');
539  }
540
541  /**
542   * Indicates whether {@code c} is one of the twenty-six uppercase ASCII alphabetic characters
543   * between {@code 'A'} and {@code 'Z'} inclusive. All others (including non-ASCII characters)
544   * return {@code false}.
545   */
546  public static boolean isUpperCase(char c) {
547    return (c >= 'A') && (c <= 'Z');
548  }
549
550  /**
551   * Truncates the given character sequence to the given maximum length. If the length of the
552   * sequence is greater than {@code maxLength}, the returned string will be exactly
553   * {@code maxLength} chars in length and will end with the given {@code truncationIndicator}.
554   * Otherwise, the sequence will be returned as a string with no changes to the content.
555   *
556   * <p>Examples:
557   *
558   * <pre>   {@code
559   *   Ascii.truncate("foobar", 7, "..."); // returns "foobar"
560   *   Ascii.truncate("foobar", 5, "..."); // returns "fo..." }</pre>
561   *
562   * <p><b>Note:</b> This method <i>may</i> work with certain non-ASCII text but is not safe for
563   * use with arbitrary Unicode text. It is mostly intended for use with text that is known to be
564   * safe for use with it (such as all-ASCII text) and for simple debugging text. When using this
565   * method, consider the following:
566   *
567   * <ul>
568   *   <li>it may split surrogate pairs</li>
569   *   <li>it may split characters and combining characters</li>
570   *   <li>it does not consider word boundaries</li>
571   *   <li>if truncating for display to users, there are other considerations that must be taken
572   *   into account</li>
573   *   <li>the appropriate truncation indicator may be locale-dependent</li>
574   *   <li>it is safe to use non-ASCII characters in the truncation indicator</li>
575   * </ul>
576   *
577   *
578   * @throws IllegalArgumentException if {@code maxLength} is less than the length of
579   *     {@code truncationIndicator}
580   * @since 16.0
581   */
582  @Beta
583  @CheckReturnValue
584  public static String truncate(CharSequence seq, int maxLength, String truncationIndicator) {
585    checkNotNull(seq);
586
587    // length to truncate the sequence to, not including the truncation indicator
588    int truncationLength = maxLength - truncationIndicator.length();
589
590    // in this worst case, this allows a maxLength equal to the length of the truncationIndicator,
591    // meaning that a string will be truncated to just the truncation indicator itself
592    checkArgument(truncationLength >= 0,
593        "maxLength (%s) must be >= length of the truncation indicator (%s)",
594        maxLength, truncationIndicator.length());
595
596    if (seq.length() <= maxLength) {
597      String string = seq.toString();
598      if (string.length() <= maxLength) {
599        return string;
600      }
601      // if the length of the toString() result was > maxLength for some reason, truncate that
602      seq = string;
603    }
604
605    return new StringBuilder(maxLength)
606        .append(seq, 0, truncationLength)
607        .append(truncationIndicator)
608        .toString();
609  }
610
611  /**
612   * Indicates whether the contents of the given character sequences {@code s1} and {@code s2} are
613   * equal, ignoring the case of any ASCII alphabetic characters between {@code 'a'} and {@code 'z'}
614   * or {@code 'A'} and {@code 'Z'} inclusive.
615   *
616   * <p>This method is significantly faster than {@link String#equalsIgnoreCase} and should be used
617   * in preference if at least one of the parameters is known to contain only ASCII characters.
618   *
619   * <p>Note however that this method does not always behave identically to expressions such as:
620   * <ul>
621   * <li>{@code string.toUpperCase().equals("UPPER CASE ASCII")}
622   * <li>{@code string.toLowerCase().equals("lower case ascii")}
623   * </ul>
624   * <p>due to case-folding of some non-ASCII characters (which does not occur in
625   * {@link String#equalsIgnoreCase}). However in almost all cases that ASCII strings are used,
626   * the author probably wanted the behavior provided by this method rather than the subtle and
627   * sometimes surprising behavior of {@code toUpperCase()} and {@code toLowerCase()}.
628   *
629   * @since 16.0
630   */
631  @Beta
632  public static boolean equalsIgnoreCase(CharSequence s1, CharSequence s2) {
633    // Calling length() is the null pointer check (so do it before we can exit early).
634    int length = s1.length();
635    if (s1 == s2) {
636      return true;
637    }
638    if (length != s2.length()) {
639      return false;
640    }
641    for (int i = 0; i < length; i++) {
642      char c1 = s1.charAt(i);
643      char c2 = s2.charAt(i);
644      if (c1 == c2) {
645        continue;
646      }
647      int alphaIndex = getAlphaIndex(c1);
648      // This was also benchmarked using '&' to avoid branching (but always evaluate the rhs),
649      // however this showed no obvious improvement.
650      if (alphaIndex < 26 && alphaIndex == getAlphaIndex(c2)) {
651        continue;
652      }
653      return false;
654    }
655    return true;
656  }
657
658  /**
659   * Returns the non-negative index value of the alpha character {@code c}, regardless of case.
660   * Ie, 'a'/'A' returns 0 and 'z'/'Z' returns 25. Non-alpha characters return a value of 26 or
661   * greater.
662   */
663  private static int getAlphaIndex(char c) {
664    // Fold upper-case ASCII to lower-case and make zero-indexed and unsigned (by casting to char).
665    return (char) ((c | 0x20) - 'a');
666  }
667}