001/*
002 * Copyright (C) 2009 The Guava Authors
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
005 * in compliance with the License. You may obtain a copy of the License at
006 *
007 * http://www.apache.org/licenses/LICENSE-2.0
008 *
009 * Unless required by applicable law or agreed to in writing, software distributed under the License
010 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
011 * or implied. See the License for the specific language governing permissions and limitations under
012 * the License.
013 */
014
015package com.google.common.net;
016
017import static com.google.common.base.Preconditions.checkArgument;
018import static com.google.common.base.Preconditions.checkNotNull;
019import static com.google.common.base.Preconditions.checkState;
020
021import com.google.common.annotations.Beta;
022import com.google.common.annotations.GwtCompatible;
023import com.google.common.base.Ascii;
024import com.google.common.base.CharMatcher;
025import com.google.common.base.Joiner;
026import com.google.common.base.Splitter;
027import com.google.common.collect.ImmutableList;
028import com.google.thirdparty.publicsuffix.PublicSuffixPatterns;
029import java.util.List;
030import javax.annotation.Nullable;
031
032/**
033 * An immutable well-formed internet domain name, such as {@code com} or {@code
034 * foo.co.uk}. Only syntactic analysis is performed; no DNS lookups or other network interactions
035 * take place. Thus there is no guarantee that the domain actually exists on the internet.
036 *
037 * <p>One common use of this class is to determine whether a given string is likely to represent an
038 * addressable domain on the web -- that is, for a candidate string {@code "xxx"}, might browsing to
039 * {@code "http://xxx/"} result in a webpage being displayed? In the past, this test was frequently
040 * done by determining whether the domain ended with a {@linkplain #isPublicSuffix() public suffix}
041 * but was not itself a public suffix. However, this test is no longer accurate. There are many
042 * domains which are both public suffixes and addressable as hosts; {@code "uk.com"} is one example.
043 * As a result, the only useful test to determine if a domain is a plausible web host is
044 * {@link #hasPublicSuffix()}. This will return {@code true} for many domains which (currently) are
045 * not hosts, such as {@code "com"}, but given that any public suffix may become a host without
046 * warning, it is better to err on the side of permissiveness and thus avoid spurious rejection of
047 * valid sites.
048 *
049 * <p>During construction, names are normalized in two ways:
050 *
051 * <ol>
052 * <li>ASCII uppercase characters are converted to lowercase.
053 * <li>Unicode dot separators other than the ASCII period ({@code '.'}) are converted to the ASCII
054 *     period.
055 * </ol>
056 *
057 * <p>The normalized values will be returned from {@link #toString()} and {@link #parts()}, and will
058 * be reflected in the result of {@link #equals(Object)}.
059 *
060 * <p><a href="http://en.wikipedia.org/wiki/Internationalized_domain_name">Internationalized domain
061 * names</a> such as {@code 网络.cn} are supported, as are the equivalent
062 * <a href="http://en.wikipedia.org/wiki/Internationalized_domain_name">IDNA Punycode-encoded</a>
063 * versions.
064 *
065 * @author Craig Berry
066 * @since 5.0
067 */
068@Beta
069@GwtCompatible
070public final class InternetDomainName {
071
072  private static final CharMatcher DOTS_MATCHER = CharMatcher.anyOf(".\u3002\uFF0E\uFF61");
073  private static final Splitter DOT_SPLITTER = Splitter.on('.');
074  private static final Joiner DOT_JOINER = Joiner.on('.');
075
076  /**
077   * Value of {@link #publicSuffixIndex} which indicates that no public suffix was found.
078   */
079  private static final int NO_PUBLIC_SUFFIX_FOUND = -1;
080
081  private static final String DOT_REGEX = "\\.";
082
083  /**
084   * Maximum parts (labels) in a domain name. This value arises from the 255-octet limit described
085   * in <a href="http://www.ietf.org/rfc/rfc2181.txt">RFC 2181</a> part 11 with the fact that the
086   * encoding of each part occupies at least two bytes (dot plus label externally, length byte plus
087   * label internally). Thus, if all labels have the minimum size of one byte, 127 of them will fit.
088   */
089  private static final int MAX_PARTS = 127;
090
091  /**
092   * Maximum length of a full domain name, including separators, and leaving room for the root
093   * label. See <a href="http://www.ietf.org/rfc/rfc2181.txt">RFC 2181</a> part 11.
094   */
095  private static final int MAX_LENGTH = 253;
096
097  /**
098   * Maximum size of a single part of a domain name. See
099   * <a href="http://www.ietf.org/rfc/rfc2181.txt">RFC 2181</a> part 11.
100   */
101  private static final int MAX_DOMAIN_PART_LENGTH = 63;
102
103  /**
104   * The full domain name, converted to lower case.
105   */
106  private final String name;
107
108  /**
109   * The parts of the domain name, converted to lower case.
110   */
111  private final ImmutableList<String> parts;
112
113  /**
114   * The index in the {@link #parts()} list at which the public suffix begins. For example, for the
115   * domain name {@code www.google.co.uk}, the value would be 2 (the index of the {@code co} part).
116   * The value is negative (specifically, {@link #NO_PUBLIC_SUFFIX_FOUND}) if no public suffix was
117   * found.
118   */
119  private final int publicSuffixIndex;
120
121  /**
122   * Constructor used to implement {@link #from(String)}, and from subclasses.
123   */
124  InternetDomainName(String name) {
125    // Normalize:
126    // * ASCII characters to lowercase
127    // * All dot-like characters to '.'
128    // * Strip trailing '.'
129
130    name = Ascii.toLowerCase(DOTS_MATCHER.replaceFrom(name, '.'));
131
132    if (name.endsWith(".")) {
133      name = name.substring(0, name.length() - 1);
134    }
135
136    checkArgument(name.length() <= MAX_LENGTH, "Domain name too long: '%s':", name);
137    this.name = name;
138
139    this.parts = ImmutableList.copyOf(DOT_SPLITTER.split(name));
140    checkArgument(parts.size() <= MAX_PARTS, "Domain has too many parts: '%s'", name);
141    checkArgument(validateSyntax(parts), "Not a valid domain name: '%s'", name);
142
143    this.publicSuffixIndex = findPublicSuffix();
144  }
145
146  /**
147   * Returns the index of the leftmost part of the public suffix, or -1 if not found. Note that the
148   * value defined as the "public suffix" may not be a public suffix according to
149   * {@link #isPublicSuffix()} if the domain ends with an excluded domain pattern such as
150   * {@code "nhs.uk"}.
151   */
152  private int findPublicSuffix() {
153    final int partsSize = parts.size();
154
155    for (int i = 0; i < partsSize; i++) {
156      String ancestorName = DOT_JOINER.join(parts.subList(i, partsSize));
157
158      if (PublicSuffixPatterns.EXACT.containsKey(ancestorName)) {
159        return i;
160      }
161
162      // Excluded domains (e.g. !nhs.uk) use the next highest
163      // domain as the effective public suffix (e.g. uk).
164
165      if (PublicSuffixPatterns.EXCLUDED.containsKey(ancestorName)) {
166        return i + 1;
167      }
168
169      if (matchesWildcardPublicSuffix(ancestorName)) {
170        return i;
171      }
172    }
173
174    return NO_PUBLIC_SUFFIX_FOUND;
175  }
176
177  /**
178   * Returns an instance of {@link InternetDomainName} after lenient validation. Specifically,
179   * validation against <a href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>
180   * ("Internationalizing Domain Names in Applications") is skipped, while validation against
181   * <a href="http://www.ietf.org/rfc/rfc1035.txt">RFC 1035</a> is relaxed in the following ways:
182   * <ul>
183   * <li>Any part containing non-ASCII characters is considered valid.
184   * <li>Underscores ('_') are permitted wherever dashes ('-') are permitted.
185   * <li>Parts other than the final part may start with a digit, as mandated by
186   * <a href="https://tools.ietf.org/html/rfc1123#section-2">RFC 1123</a>.
187   * </ul>
188   *
189   *
190   * @param domain A domain name (not IP address)
191   * @throws IllegalArgumentException if {@code name} is not syntactically valid according to
192   *     {@link #isValid}
193   * @since 10.0 (previously named {@code fromLenient})
194   */
195  public static InternetDomainName from(String domain) {
196    return new InternetDomainName(checkNotNull(domain));
197  }
198
199  /**
200   * Validation method used by {@code from} to ensure that the domain name is syntactically valid
201   * according to RFC 1035.
202   *
203   * @return Is the domain name syntactically valid?
204   */
205  private static boolean validateSyntax(List<String> parts) {
206    final int lastIndex = parts.size() - 1;
207
208    // Validate the last part specially, as it has different syntax rules.
209
210    if (!validatePart(parts.get(lastIndex), true)) {
211      return false;
212    }
213
214    for (int i = 0; i < lastIndex; i++) {
215      String part = parts.get(i);
216      if (!validatePart(part, false)) {
217        return false;
218      }
219    }
220
221    return true;
222  }
223
224  private static final CharMatcher DASH_MATCHER = CharMatcher.anyOf("-_");
225
226  private static final CharMatcher PART_CHAR_MATCHER =
227      CharMatcher.javaLetterOrDigit().or(DASH_MATCHER);
228
229  /**
230   * Helper method for {@link #validateSyntax(List)}. Validates that one part of a domain name is
231   * valid.
232   *
233   * @param part The domain name part to be validated
234   * @param isFinalPart Is this the final (rightmost) domain part?
235   * @return Whether the part is valid
236   */
237  private static boolean validatePart(String part, boolean isFinalPart) {
238
239    // These tests could be collapsed into one big boolean expression, but
240    // they have been left as independent tests for clarity.
241
242    if (part.length() < 1 || part.length() > MAX_DOMAIN_PART_LENGTH) {
243      return false;
244    }
245
246    /*
247     * GWT claims to support java.lang.Character's char-classification methods, but it actually only
248     * works for ASCII. So for now, assume any non-ASCII characters are valid. The only place this
249     * seems to be documented is here:
250     * http://osdir.com/ml/GoogleWebToolkitContributors/2010-03/msg00178.html
251     *
252     * <p>ASCII characters in the part are expected to be valid per RFC 1035, with underscore also
253     * being allowed due to widespread practice.
254     */
255
256    String asciiChars = CharMatcher.ascii().retainFrom(part);
257
258    if (!PART_CHAR_MATCHER.matchesAllOf(asciiChars)) {
259      return false;
260    }
261
262    // No initial or final dashes or underscores.
263
264    if (DASH_MATCHER.matches(part.charAt(0))
265        || DASH_MATCHER.matches(part.charAt(part.length() - 1))) {
266      return false;
267    }
268
269    /*
270     * Note that we allow (in contravention of a strict interpretation of the relevant RFCs) domain
271     * parts other than the last may begin with a digit (for example, "3com.com"). It's important to
272     * disallow an initial digit in the last part; it's the only thing that stops an IPv4 numeric
273     * address like 127.0.0.1 from looking like a valid domain name.
274     */
275
276    if (isFinalPart && CharMatcher.digit().matches(part.charAt(0))) {
277      return false;
278    }
279
280    return true;
281  }
282
283  /**
284   * Returns the individual components of this domain name, normalized to all lower case. For
285   * example, for the domain name {@code mail.google.com}, this method returns the list
286   * {@code ["mail", "google", "com"]}.
287   */
288  public ImmutableList<String> parts() {
289    return parts;
290  }
291
292  /**
293   * Indicates whether this domain name represents a <i>public suffix</i>, as defined by the Mozilla
294   * Foundation's <a href="http://publicsuffix.org/">Public Suffix List</a> (PSL). A public suffix
295   * is one under which Internet users can directly register names, such as {@code com},
296   * {@code co.uk} or {@code pvt.k12.wy.us}. Examples of domain names that are <i>not</i> public
297   * suffixes include {@code google}, {@code google.com} and {@code foo.co.uk}.
298   *
299   * @return {@code true} if this domain name appears exactly on the public suffix list
300   * @since 6.0
301   */
302  public boolean isPublicSuffix() {
303    return publicSuffixIndex == 0;
304  }
305
306  /**
307   * Indicates whether this domain name ends in a {@linkplain #isPublicSuffix() public suffix},
308   * including if it is a public suffix itself. For example, returns {@code true} for
309   * {@code www.google.com}, {@code foo.co.uk} and {@code com}, but not for {@code google} or
310   * {@code google.foo}. This is the recommended method for determining whether a domain is
311   * potentially an addressable host.
312   *
313   * @since 6.0
314   */
315  public boolean hasPublicSuffix() {
316    return publicSuffixIndex != NO_PUBLIC_SUFFIX_FOUND;
317  }
318
319  /**
320   * Returns the {@linkplain #isPublicSuffix() public suffix} portion of the domain name, or
321   * {@code null} if no public suffix is present.
322   *
323   * @since 6.0
324   */
325  public InternetDomainName publicSuffix() {
326    return hasPublicSuffix() ? ancestor(publicSuffixIndex) : null;
327  }
328
329  /**
330   * Indicates whether this domain name ends in a {@linkplain #isPublicSuffix() public suffix},
331   * while not being a public suffix itself. For example, returns {@code true} for
332   * {@code www.google.com}, {@code foo.co.uk} and {@code bar.ca.us}, but not for {@code google},
333   * {@code com}, or {@code
334   * google.foo}.
335   *
336   * <p><b>Warning:</b> a {@code false} result from this method does not imply that the domain does
337   * not represent an addressable host, as many public suffixes are also addressable hosts. Use
338   * {@link #hasPublicSuffix()} for that test.
339   *
340   * <p>This method can be used to determine whether it will probably be possible to set cookies on
341   * the domain, though even that depends on individual browsers' implementations of cookie
342   * controls. See <a href="http://www.ietf.org/rfc/rfc2109.txt">RFC 2109</a> for details.
343   *
344   * @since 6.0
345   */
346  public boolean isUnderPublicSuffix() {
347    return publicSuffixIndex > 0;
348  }
349
350  /**
351   * Indicates whether this domain name is composed of exactly one subdomain component followed by a
352   * {@linkplain #isPublicSuffix() public suffix}. For example, returns {@code true} for
353   * {@code google.com} and {@code foo.co.uk}, but not for {@code www.google.com} or {@code co.uk}.
354   *
355   * <p><b>Warning:</b> A {@code true} result from this method does not imply that the domain is at
356   * the highest level which is addressable as a host, as many public suffixes are also addressable
357   * hosts. For example, the domain {@code bar.uk.com} has a public suffix of {@code uk.com}, so it
358   * would return {@code true} from this method. But {@code uk.com} is itself an addressable host.
359   *
360   * <p>This method can be used to determine whether a domain is probably the highest level for
361   * which cookies may be set, though even that depends on individual browsers' implementations of
362   * cookie controls. See <a href="http://www.ietf.org/rfc/rfc2109.txt">RFC 2109</a> for details.
363   *
364   * @since 6.0
365   */
366  public boolean isTopPrivateDomain() {
367    return publicSuffixIndex == 1;
368  }
369
370  /**
371   * Returns the portion of this domain name that is one level beneath the public suffix. For
372   * example, for {@code x.adwords.google.co.uk} it returns {@code google.co.uk}, since
373   * {@code co.uk} is a public suffix.
374   *
375   * <p>If {@link #isTopPrivateDomain()} is true, the current domain name instance is returned.
376   *
377   * <p>This method should not be used to determine the topmost parent domain which is addressable
378   * as a host, as many public suffixes are also addressable hosts. For example, the domain
379   * {@code foo.bar.uk.com} has a public suffix of {@code uk.com}, so it would return
380   * {@code bar.uk.com} from this method. But {@code uk.com} is itself an addressable host.
381   *
382   * <p>This method can be used to determine the probable highest level parent domain for which
383   * cookies may be set, though even that depends on individual browsers' implementations of cookie
384   * controls.
385   *
386   * @throws IllegalStateException if this domain does not end with a public suffix
387   * @since 6.0
388   */
389  public InternetDomainName topPrivateDomain() {
390    if (isTopPrivateDomain()) {
391      return this;
392    }
393    checkState(isUnderPublicSuffix(), "Not under a public suffix: %s", name);
394    return ancestor(publicSuffixIndex - 1);
395  }
396
397  /**
398   * Indicates whether this domain is composed of two or more parts.
399   */
400  public boolean hasParent() {
401    return parts.size() > 1;
402  }
403
404  /**
405   * Returns an {@code InternetDomainName} that is the immediate ancestor of this one; that is, the
406   * current domain with the leftmost part removed. For example, the parent of
407   * {@code www.google.com} is {@code google.com}.
408   *
409   * @throws IllegalStateException if the domain has no parent, as determined by {@link #hasParent}
410   */
411  public InternetDomainName parent() {
412    checkState(hasParent(), "Domain '%s' has no parent", name);
413    return ancestor(1);
414  }
415
416  /**
417   * Returns the ancestor of the current domain at the given number of levels "higher" (rightward)
418   * in the subdomain list. The number of levels must be non-negative, and less than {@code N-1},
419   * where {@code N} is the number of parts in the domain.
420   *
421   * <p>TODO: Reasonable candidate for addition to public API.
422   */
423  private InternetDomainName ancestor(int levels) {
424    return from(DOT_JOINER.join(parts.subList(levels, parts.size())));
425  }
426
427  /**
428   * Creates and returns a new {@code InternetDomainName} by prepending the argument and a dot to
429   * the current name. For example, {@code
430   * InternetDomainName.from("foo.com").child("www.bar")} returns a new {@code InternetDomainName}
431   * with the value {@code www.bar.foo.com}. Only lenient validation is performed, as described
432   * {@link #from(String) here}.
433   *
434   * @throws NullPointerException if leftParts is null
435   * @throws IllegalArgumentException if the resulting name is not valid
436   */
437  public InternetDomainName child(String leftParts) {
438    return from(checkNotNull(leftParts) + "." + name);
439  }
440
441  /**
442   * Indicates whether the argument is a syntactically valid domain name using lenient validation.
443   * Specifically, validation against <a href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>
444   * ("Internationalizing Domain Names in Applications") is skipped.
445   *
446   * <p>The following two code snippets are equivalent:
447   *
448   * <pre>   {@code
449   *   domainName = InternetDomainName.isValid(name)
450   *       ? InternetDomainName.from(name)
451   *       : DEFAULT_DOMAIN;}</pre>
452   *
453   * <pre>   {@code
454   *   try {
455   *     domainName = InternetDomainName.from(name);
456   *   } catch (IllegalArgumentException e) {
457   *     domainName = DEFAULT_DOMAIN;
458   *   }}</pre>
459   *
460   * @since 8.0 (previously named {@code isValidLenient})
461   */
462  public static boolean isValid(String name) {
463    try {
464      from(name);
465      return true;
466    } catch (IllegalArgumentException e) {
467      return false;
468    }
469  }
470
471  /**
472   * Does the domain name match one of the "wildcard" patterns (e.g. {@code "*.ar"})?
473   */
474  private static boolean matchesWildcardPublicSuffix(String domain) {
475    final String[] pieces = domain.split(DOT_REGEX, 2);
476    return pieces.length == 2 && PublicSuffixPatterns.UNDER.containsKey(pieces[1]);
477  }
478
479  /**
480   * Returns the domain name, normalized to all lower case.
481   */
482  @Override
483  public String toString() {
484    return name;
485  }
486
487  /**
488   * Equality testing is based on the text supplied by the caller, after normalization as described
489   * in the class documentation. For example, a non-ASCII Unicode domain name and the Punycode
490   * version of the same domain name would not be considered equal.
491   *
492   */
493  @Override
494  public boolean equals(@Nullable Object object) {
495    if (object == this) {
496      return true;
497    }
498
499    if (object instanceof InternetDomainName) {
500      InternetDomainName that = (InternetDomainName) object;
501      return this.name.equals(that.name);
502    }
503
504    return false;
505  }
506
507  @Override
508  public int hashCode() {
509    return name.hashCode();
510  }
511}