001/*
002 * Copyright (C) 2009 The Guava Authors
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 * http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 */
016
017package com.google.common.net;
018
019import static com.google.common.base.Preconditions.checkArgument;
020import static com.google.common.base.Preconditions.checkNotNull;
021import static com.google.common.base.Preconditions.checkState;
022
023import com.google.common.annotations.Beta;
024import com.google.common.annotations.GwtCompatible;
025import com.google.common.base.Ascii;
026import com.google.common.base.CharMatcher;
027import com.google.common.base.Joiner;
028import com.google.common.base.Splitter;
029import com.google.common.collect.ImmutableList;
030
031import java.util.List;
032
033import javax.annotation.Nullable;
034
035/**
036 * An immutable well-formed internet domain name, such as {@code com} or {@code
037 * foo.co.uk}. Only syntactic analysis is performed; no DNS lookups or other
038 * network interactions take place. Thus there is no guarantee that the domain
039 * actually exists on the internet.
040 *
041 * <p>One common use of this class is to determine whether a given string is
042 * likely to represent an addressable domain on the web -- that is, for a
043 * candidate string {@code "xxx"}, might browsing to {@code "http://xxx/"}
044 * result in a webpage being displayed? In the past, this test was frequently
045 * done by determining whether the domain ended with a {@linkplain
046 * #isPublicSuffix() public suffix} but was not itself a public suffix. However,
047 * this test is no longer accurate. There are many domains which are both public
048 * suffixes and addressable as hosts; {@code "uk.com"} is one example. As a
049 * result, the only useful test to determine if a domain is a plausible web host
050 * is {@link #hasPublicSuffix()}. This will return {@code true} for many domains
051 * which (currently) are not hosts, such as {@code "com"}, but given that any
052 * public suffix may become a host without warning, it is better to err on the
053 * side of permissiveness and thus avoid spurious rejection of valid sites.
054 *
055 * <p>During construction, names are normalized in two ways:
056 * <ol>
057 * <li>ASCII uppercase characters are converted to lowercase.
058 * <li>Unicode dot separators other than the ASCII period ({@code '.'}) are
059 * converted to the ASCII period.
060 * </ol>
061 * <p>The normalized values will be returned from {@link #name()} and
062 * {@link #parts()}, and will be reflected in the result of
063 * {@link #equals(Object)}.
064 *
065 * <p><a href="http://en.wikipedia.org/wiki/Internationalized_domain_name">
066 * Internationalized domain names</a> such as {@code 网络.cn} are supported, as
067 * are the equivalent <a
068 * href="http://en.wikipedia.org/wiki/Internationalized_domain_name">IDNA
069 * Punycode-encoded</a> versions.
070 *
071 * @author Craig Berry
072 * @since 5.0
073 */
074@Beta
075@GwtCompatible
076public final class InternetDomainName {
077
078  private static final CharMatcher DOTS_MATCHER =
079      CharMatcher.anyOf(".\u3002\uFF0E\uFF61");
080  private static final Splitter DOT_SPLITTER = Splitter.on('.');
081  private static final Joiner DOT_JOINER = Joiner.on('.');
082
083  /**
084   * Value of {@link #publicSuffixIndex} which indicates that no public suffix
085   * was found.
086   */
087  private static final int NO_PUBLIC_SUFFIX_FOUND = -1;
088
089  private static final String DOT_REGEX = "\\.";
090
091  /**
092   * Maximum parts (labels) in a domain name. This value arises from
093   * the 255-octet limit described in
094   * <a href="http://www.ietf.org/rfc/rfc2181.txt">RFC 2181</a> part 11 with
095   * the fact that the encoding of each part occupies at least two bytes
096   * (dot plus label externally, length byte plus label internally). Thus, if
097   * all labels have the minimum size of one byte, 127 of them will fit.
098   */
099  private static final int MAX_PARTS = 127;
100
101  /**
102   * Maximum length of a full domain name, including separators, and
103   * leaving room for the root label. See
104   * <a href="http://www.ietf.org/rfc/rfc2181.txt">RFC 2181</a> part 11.
105   */
106  private static final int MAX_LENGTH = 253;
107
108  /**
109   * Maximum size of a single part of a domain name. See
110   * <a href="http://www.ietf.org/rfc/rfc2181.txt">RFC 2181</a> part 11.
111   */
112  private static final int MAX_DOMAIN_PART_LENGTH = 63;
113
114  /**
115   * The full domain name, converted to lower case.
116   */
117  private final String name;
118
119  /**
120   * The parts of the domain name, converted to lower case.
121   */
122  private final ImmutableList<String> parts;
123
124  /**
125   * The index in the {@link #parts()} list at which the public suffix begins.
126   * For example, for the domain name {@code www.google.co.uk}, the value would
127   * be 2 (the index of the {@code co} part). The value is negative
128   * (specifically, {@link #NO_PUBLIC_SUFFIX_FOUND}) if no public suffix was
129   * found.
130   */
131  private final int publicSuffixIndex;
132
133  /**
134   * Constructor used to implement {@link #from(String)}, and from subclasses.
135   */
136  InternetDomainName(String name) {
137    // Normalize:
138    // * ASCII characters to lowercase
139    // * All dot-like characters to '.'
140    // * Strip trailing '.'
141
142    name = Ascii.toLowerCase(DOTS_MATCHER.replaceFrom(name, '.'));
143
144    if (name.endsWith(".")) {
145      name = name.substring(0, name.length() - 1);
146    }
147
148    checkArgument(name.length() <= MAX_LENGTH,
149        "Domain name too long: '%s':", name);
150    this.name = name;
151
152    this.parts = ImmutableList.copyOf(DOT_SPLITTER.split(name));
153    checkArgument(parts.size() <= MAX_PARTS,
154        "Domain has too many parts: '%s'", name);
155    checkArgument(validateSyntax(parts), "Not a valid domain name: '%s'", name);
156
157    this.publicSuffixIndex = findPublicSuffix();
158  }
159
160  /**
161   * Returns the index of the leftmost part of the public suffix, or -1 if not
162   * found. Note that the value defined as the "public suffix" may not be a
163   * public suffix according to {@link #isPublicSuffix()} if the domain ends
164   * with an excluded domain pattern such as {@code "nhs.uk"}.
165   */
166  private int findPublicSuffix() {
167    final int partsSize = parts.size();
168
169    for (int i = 0; i < partsSize; i++) {
170      String ancestorName = DOT_JOINER.join(parts.subList(i, partsSize));
171
172      if (TldPatterns.EXACT.contains(ancestorName)) {
173        return i;
174      }
175
176      // Excluded domains (e.g. !nhs.uk) use the next highest
177      // domain as the effective public suffix (e.g. uk).
178
179      if (TldPatterns.EXCLUDED.contains(ancestorName)) {
180        return i + 1;
181      }
182
183      if (matchesWildcardPublicSuffix(ancestorName)) {
184        return i;
185      }
186    }
187
188    return NO_PUBLIC_SUFFIX_FOUND;
189  }
190
191  /**
192   * A deprecated synonym for {@link #from(String)}.
193   *
194   * @param domain A domain name (not IP address)
195   * @throws IllegalArgumentException if {@code name} is not syntactically valid
196   *     according to {@link #isValid}
197   * @since 8.0 (previously named {@code from})
198   * @deprecated Use {@link #from(String)}
199   */
200  @Deprecated
201  public static InternetDomainName fromLenient(String domain) {
202    return from(domain);
203  }
204
205  /**
206   * Returns an instance of {@link InternetDomainName} after lenient
207   * validation.  Specifically, validation against <a
208   * href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>
209   * ("Internationalizing Domain Names in Applications") is skipped, while
210   * validation against <a
211   * href="http://www.ietf.org/rfc/rfc1035.txt">RFC 1035</a> is relaxed in
212   * the following ways:
213   * <ul>
214   * <li>Any part containing non-ASCII characters is considered valid.
215   * <li>Underscores ('_') are permitted wherever dashes ('-') are permitted.
216   * <li>Parts other than the final part may start with a digit.
217   * </ul>
218   *
219   *
220   * @param domain A domain name (not IP address)
221   * @throws IllegalArgumentException if {@code name} is not syntactically valid
222   *     according to {@link #isValid}
223   * @since 10.0 (previously named {@code fromLenient})
224   */
225  public static InternetDomainName from(String domain) {
226    return new InternetDomainName(checkNotNull(domain));
227  }
228
229  /**
230   * Validation method used by {@from} to ensure that the domain name is
231   * syntactically valid according to RFC 1035.
232   *
233   * @return Is the domain name syntactically valid?
234   */
235  private static boolean validateSyntax(List<String> parts) {
236    final int lastIndex = parts.size() - 1;
237
238    // Validate the last part specially, as it has different syntax rules.
239
240    if (!validatePart(parts.get(lastIndex), true)) {
241      return false;
242    }
243
244    for (int i = 0; i < lastIndex; i++) {
245      String part = parts.get(i);
246      if (!validatePart(part, false)) {
247        return false;
248      }
249    }
250
251    return true;
252  }
253
254  private static final CharMatcher DASH_MATCHER = CharMatcher.anyOf("-_");
255
256  private static final CharMatcher PART_CHAR_MATCHER =
257      CharMatcher.JAVA_LETTER_OR_DIGIT.or(DASH_MATCHER);
258
259  /**
260   * Helper method for {@link #validateSyntax(List)}. Validates that one part of
261   * a domain name is valid.
262   *
263   * @param part The domain name part to be validated
264   * @param isFinalPart Is this the final (rightmost) domain part?
265   * @return Whether the part is valid
266   */
267  private static boolean validatePart(String part, boolean isFinalPart) {
268
269    // These tests could be collapsed into one big boolean expression, but
270    // they have been left as independent tests for clarity.
271
272    if (part.length() < 1 || part.length() > MAX_DOMAIN_PART_LENGTH) {
273      return false;
274    }
275
276    /*
277     * GWT claims to support java.lang.Character's char-classification methods,
278     * but it actually only works for ASCII. So for now, assume any non-ASCII
279     * characters are valid. The only place this seems to be documented is here:
280     * http://osdir.com/ml/GoogleWebToolkitContributors/2010-03/msg00178.html
281     *
282     * <p>ASCII characters in the part are expected to be valid per RFC 1035,
283     * with underscore also being allowed due to widespread practice.
284     */
285
286    String asciiChars = CharMatcher.ASCII.retainFrom(part);
287
288    if (!PART_CHAR_MATCHER.matchesAllOf(asciiChars)) {
289      return false;
290    }
291
292    // No initial or final dashes or underscores.
293
294    if (DASH_MATCHER.matches(part.charAt(0))
295        || DASH_MATCHER.matches(part.charAt(part.length() - 1))) {
296      return false;
297    }
298
299    /*
300     * Note that we allow (in contravention of a strict interpretation of the
301     * relevant RFCs) domain parts other than the last may begin with a digit
302     * (for example, "3com.com"). It's important to disallow an initial digit in
303     * the last part; it's the only thing that stops an IPv4 numeric address
304     * like 127.0.0.1 from looking like a valid domain name.
305     */
306
307    if (isFinalPart && CharMatcher.DIGIT.matches(part.charAt(0))) {
308      return false;
309    }
310
311    return true;
312  }
313
314  /**
315   * A deprecated synonym for {@link #toString()}.
316   *
317   * @deprecated Use {@link #toString()}
318   */
319  @Deprecated
320  public String name() {
321    return toString();
322  }
323
324  /**
325   * Returns the individual components of this domain name, normalized to all
326   * lower case. For example, for the domain name {@code mail.google.com}, this
327   * method returns the list {@code ["mail", "google", "com"]}.
328   */
329  public ImmutableList<String> parts() {
330    return parts;
331  }
332
333  /**
334   * Indicates whether this domain name represents a <i>public suffix</i>, as
335   * defined by the Mozilla Foundation's
336   * <a href="http://publicsuffix.org/">Public Suffix List</a> (PSL). A public
337   * suffix is one under which Internet users can directly register names, such
338   * as {@code com}, {@code co.uk} or {@code pvt.k12.wy.us}. Examples of domain
339   * names that are <i>not</i> public suffixes include {@code google}, {@code
340   * google.com} and {@code foo.co.uk}.
341   *
342   * @return {@code true} if this domain name appears exactly on the public
343   *     suffix list
344   * @since 6.0
345   */
346  public boolean isPublicSuffix() {
347    return publicSuffixIndex == 0;
348  }
349
350  /**
351   * Indicates whether this domain name ends in a {@linkplain #isPublicSuffix()
352   * public suffix}, including if it is a public suffix itself. For example,
353   * returns {@code true} for {@code www.google.com}, {@code foo.co.uk} and
354   * {@code com}, but not for {@code google} or {@code google.foo}. This is
355   * the recommended method for determining whether a domain is potentially an
356   * addressable host.
357   *
358   * @since 6.0
359   */
360  public boolean hasPublicSuffix() {
361    return publicSuffixIndex != NO_PUBLIC_SUFFIX_FOUND;
362  }
363
364  /**
365   * Returns the {@linkplain #isPublicSuffix() public suffix} portion of the
366   * domain name, or {@code null} if no public suffix is present.
367   *
368   * @since 6.0
369   */
370  public InternetDomainName publicSuffix() {
371    return hasPublicSuffix() ? ancestor(publicSuffixIndex) : null;
372  }
373
374  /**
375   * Indicates whether this domain name ends in a {@linkplain #isPublicSuffix()
376   * public suffix}, while not being a public suffix itself. For example,
377   * returns {@code true} for {@code www.google.com}, {@code foo.co.uk} and
378   * {@code bar.ca.us}, but not for {@code google}, {@code com}, or {@code
379   * google.foo}.
380   *
381   * <p><b>Warning:</b> a {@code false} result from this method does not imply
382   * that the domain does not represent an addressable host, as many public
383   * suffixes are also addressable hosts. Use {@link #hasPublicSuffix()} for
384   * that test.
385   *
386   * <p>This method can be used to determine whether it will probably be
387   * possible to set cookies on the domain, though even that depends on
388   * individual browsers' implementations of cookie controls. See
389   * <a href="http://www.ietf.org/rfc/rfc2109.txt">RFC 2109</a> for details.
390   *
391   * @since 6.0
392   */
393  public boolean isUnderPublicSuffix() {
394    return publicSuffixIndex > 0;
395  }
396
397  /**
398   * Indicates whether this domain name is composed of exactly one subdomain
399   * component followed by a {@linkplain #isPublicSuffix() public suffix}. For
400   * example, returns {@code true} for {@code google.com} and {@code foo.co.uk},
401   * but not for {@code www.google.com} or {@code co.uk}.
402   *
403   * <p><b>Warning:</b> A {@code true} result from this method does not imply
404   * that the domain is at the highest level which is addressable as a host, as
405   * many public suffixes are also addressable hosts. For example, the domain
406   * {@code bar.uk.com} has a public suffix of {@code uk.com}, so it would
407   * return {@code true} from this method. But {@code uk.com} is itself an
408   * addressable host.
409   *
410   * <p>This method can be used to determine whether a domain is probably the
411   * highest level for which cookies may be set, though even that depends on
412   * individual browsers' implementations of cookie controls. See
413   * <a href="http://www.ietf.org/rfc/rfc2109.txt">RFC 2109</a> for details.
414   *
415   * @since 6.0
416   */
417  public boolean isTopPrivateDomain() {
418    return publicSuffixIndex == 1;
419  }
420
421  /**
422   * Returns the portion of this domain name that is one level beneath the
423   * public suffix. For example, for {@code x.adwords.google.co.uk} it returns
424   * {@code google.co.uk}, since {@code co.uk} is a public suffix.
425   *
426   * <p>If {@link #isTopPrivateDomain()} is true, the current domain name
427   * instance is returned.
428   *
429   * <p>This method should not be used to determine the topmost parent domain
430   * which is addressable as a host, as many public suffixes are also
431   * addressable hosts. For example, the domain {@code foo.bar.uk.com} has
432   * a public suffix of {@code uk.com}, so it would return {@code bar.uk.com}
433   * from this method. But {@code uk.com} is itself an addressable host.
434   *
435   * <p>This method can be used to determine the probable highest level parent
436   * domain for which cookies may be set, though even that depends on individual
437   * browsers' implementations of cookie controls.
438   *
439   * @throws IllegalStateException if this domain does not end with a
440   *     public suffix
441   * @since 6.0
442   */
443  public InternetDomainName topPrivateDomain() {
444    if (isTopPrivateDomain()) {
445      return this;
446    }
447    checkState(isUnderPublicSuffix(), "Not under a public suffix: %s", name);
448    return ancestor(publicSuffixIndex - 1);
449  }
450
451  /**
452   * Indicates whether this domain is composed of two or more parts.
453   */
454  public boolean hasParent() {
455    return parts.size() > 1;
456  }
457
458  /**
459   * Returns an {@code InternetDomainName} that is the immediate ancestor of
460   * this one; that is, the current domain with the leftmost part removed. For
461   * example, the parent of {@code www.google.com} is {@code google.com}.
462   *
463   * @throws IllegalStateException if the domain has no parent, as determined
464   *     by {@link #hasParent}
465   */
466  public InternetDomainName parent() {
467    checkState(hasParent(), "Domain '%s' has no parent", name);
468    return ancestor(1);
469  }
470
471  /**
472   * Returns the ancestor of the current domain at the given number of levels
473   * "higher" (rightward) in the subdomain list. The number of levels must be
474   * non-negative, and less than {@code N-1}, where {@code N} is the number of
475   * parts in the domain.
476   *
477   * <p>TODO: Reasonable candidate for addition to public API.
478   */
479  private InternetDomainName ancestor(int levels) {
480    return from(DOT_JOINER.join(parts.subList(levels, parts.size())));
481  }
482
483  /**
484   * Creates and returns a new {@code InternetDomainName} by prepending the
485   * argument and a dot to the current name. For example, {@code
486   * InternetDomainName.from("foo.com").child("www.bar")} returns a new
487   * {@code InternetDomainName} with the value {@code www.bar.foo.com}. Only
488   * lenient validation is performed, as described {@link #from(String) here}.
489   *
490   * @throws NullPointerException if leftParts is null
491   * @throws IllegalArgumentException if the resulting name is not valid
492   */
493  public InternetDomainName child(String leftParts) {
494    return from(checkNotNull(leftParts) + "." + name);
495  }
496
497  /**
498   * Indicates whether the argument is a syntactically valid domain name using
499   * lenient validation. Specifically, validation against <a
500   * href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>
501   * ("Internationalizing Domain Names in Applications") is skipped.
502   *
503   * <p>The following two code snippets are equivalent:
504   *
505   * <pre>   {@code
506   *   domainName = InternetDomainName.isValid(name)
507   *       ? InternetDomainName.from(name)
508   *       : DEFAULT_DOMAIN;}</pre>
509   *
510   * <pre>   {@code
511   *   try {
512   *     domainName = InternetDomainName.from(name);
513   *   } catch (IllegalArgumentException e) {
514   *     domainName = DEFAULT_DOMAIN;
515   *   }}</pre>
516   *
517   * @since 8.0 (previously named {@code isValidLenient})
518   */
519  public static boolean isValid(String name) {
520    try {
521      from(name);
522      return true;
523    } catch (IllegalArgumentException e) {
524      return false;
525    }
526  }
527
528  /**
529   * Does the domain name match one of the "wildcard" patterns (e.g.
530   * {@code "*.ar"})?
531   */
532  private static boolean matchesWildcardPublicSuffix(String domain) {
533    final String[] pieces = domain.split(DOT_REGEX, 2);
534    return pieces.length == 2 && TldPatterns.UNDER.contains(pieces[1]);
535  }
536
537  /**
538   * Returns the domain name, normalized to all lower case.
539   */
540  @Override
541  public String toString() {
542    return name;
543  }
544
545  /**
546   * Equality testing is based on the text supplied by the caller,
547   * after normalization as described in the class documentation. For
548   * example, a non-ASCII Unicode domain name and the Punycode version
549   * of the same domain name would not be considered equal.
550   *
551   */
552  @Override
553  public boolean equals(@Nullable Object object) {
554    if (object == this) {
555      return true;
556    }
557
558    if (object instanceof InternetDomainName) {
559      InternetDomainName that = (InternetDomainName) object;
560      return this.name.equals(that.name);
561    }
562
563    return false;
564  }
565
566  @Override
567  public int hashCode() {
568    return name.hashCode();
569  }
570}