001    /*
002     * Copyright (C) 2009 The Guava Authors
003     *
004     * Licensed under the Apache License, Version 2.0 (the "License");
005     * you may not use this file except in compliance with the License.
006     * You may obtain a copy of the License at
007     *
008     * http://www.apache.org/licenses/LICENSE-2.0
009     *
010     * Unless required by applicable law or agreed to in writing, software
011     * distributed under the License is distributed on an "AS IS" BASIS,
012     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013     * See the License for the specific language governing permissions and
014     * limitations under the License.
015     */
016    
017    package com.google.common.net;
018    
019    import static com.google.common.base.Preconditions.checkArgument;
020    import static com.google.common.base.Preconditions.checkNotNull;
021    import static com.google.common.base.Preconditions.checkState;
022    
023    import com.google.common.annotations.Beta;
024    import com.google.common.annotations.GwtCompatible;
025    import com.google.common.base.Ascii;
026    import com.google.common.base.CharMatcher;
027    import com.google.common.base.Joiner;
028    import com.google.common.base.Objects;
029    import com.google.common.base.Splitter;
030    import com.google.common.collect.ImmutableList;
031    
032    import java.util.List;
033    
034    import javax.annotation.Nullable;
035    
036    /**
037     * An immutable well-formed internet domain name, such as {@code com} or {@code
038     * foo.co.uk}. Only syntactic analysis is performed; no DNS lookups or other
039     * network interactions take place. Thus there is no guarantee that the domain
040     * actually exists on the internet.
041     *
042     * <p>One common use of this class is to determine whether a given string is
043     * likely to represent an addressable domain on the web -- that is, for a
044     * candidate string {@code "xxx"}, might browsing to {@code "http://xxx/"}
045     * result in a webpage being displayed? In the past, this test was frequently
046     * done by determining whether the domain ended with a {@linkplain
047     * #isPublicSuffix() public suffix} but was not itself a public suffix. However,
048     * this test is no longer accurate. There are many domains which are both public
049     * suffixes and addressable as hosts; {@code "uk.com"} is one example. As a
050     * result, the only useful test to determine if a domain is a plausible web host
051     * is {@link #hasPublicSuffix()}. This will return {@code true} for many domains
052     * which (currently) are not hosts, such as {@code "com"}), but given that any
053     * public suffix may become a host without warning, it is better to err on the
054     * side of permissiveness and thus avoid spurious rejection of valid sites.
055     *
056     * <p>During construction, names are normalized in two ways:
057     * <ol>
058     * <li>ASCII uppercase characters are converted to lowercase.
059     * <li>Unicode dot separators other than the ASCII period ({@code '.'}) are
060     * converted to the ASCII period.
061     * </ol>
062     * The normalized values will be returned from {@link #name()} and
063     * {@link #parts()}, and will be reflected in the result of
064     * {@link #equals(Object)}.
065     *
066     * <p><a href="http://en.wikipedia.org/wiki/Internationalized_domain_name">
067     * internationalized domain names</a> such as {@code 网络.cn} are supported, as
068     * are the equivalent <a
069     * href="http://en.wikipedia.org/wiki/Internationalized_domain_name">IDNA
070     * Punycode-encoded</a> versions.
071     *
072     * @author Craig Berry
073     * @since 5.0
074     */
075    @Beta
076    @GwtCompatible(emulated = true)
077    public final class InternetDomainName {
078    
079      private static final CharMatcher DOTS_MATCHER =
080          CharMatcher.anyOf(".\u3002\uFF0E\uFF61");
081      private static final Splitter DOT_SPLITTER = Splitter.on('.');
082      private static final Joiner DOT_JOINER = Joiner.on('.');
083    
084      /**
085       * Value of {@link #publicSuffixIndex} which indicates that no public suffix
086       * was found.
087       */
088      private static final int NO_PUBLIC_SUFFIX_FOUND = -1;
089    
090      private static final String DOT_REGEX = "\\.";
091    
092      /**
093       * Maximum parts (labels) in a domain name. This value arises from
094       * the 255-octet limit described in
095       * <a href="http://www.ietf.org/rfc/rfc2181.txt">RFC 2181</a> part 11 with
096       * the fact that the encoding of each part occupies at least two bytes
097       * (dot plus label externally, length byte plus label internally). Thus, if
098       * all labels have the minimum size of one byte, 127 of them will fit.
099       */
100      private static final int MAX_PARTS = 127;
101    
102      /**
103       * Maximum length of a full domain name, including separators, and
104       * leaving room for the root label. See
105       * <a href="http://www.ietf.org/rfc/rfc2181.txt">RFC 2181</a> part 11.
106       */
107      private static final int MAX_LENGTH = 253;
108    
109      /**
110       * Maximum size of a single part of a domain name. See
111       * <a href="http://www.ietf.org/rfc/rfc2181.txt">RFC 2181</a> part 11.
112       */
113      private static final int MAX_DOMAIN_PART_LENGTH = 63;
114    
115      /**
116       * The full domain name, converted to lower case.
117       */
118      private final String name;
119    
120      /**
121       * The parts of the domain name, converted to lower case.
122       */
123      private final ImmutableList<String> parts;
124    
125      /**
126       * The index in the {@link #parts()} list at which the public suffix begins.
127       * For example, for the domain name {@code www.google.co.uk}, the value would
128       * be 2 (the index of the {@code co} part). The value is negative
129       * (specifically, {@link #NO_PUBLIC_SUFFIX_FOUND}) if no public suffix was
130       * found.
131       */
132      private final int publicSuffixIndex;
133    
134      /**
135       * Constructor used to implement {@link #from(String)}, and from subclasses.
136       */
137      InternetDomainName(String name) {
138        // Normalize:
139        // * ASCII characters to lowercase
140        // * All dot-like characters to '.'
141        // * Strip trailing '.'
142    
143        name = Ascii.toLowerCase(DOTS_MATCHER.replaceFrom(name, '.'));
144    
145        if (name.endsWith(".")) {
146          name = name.substring(0, name.length() - 1);
147        }
148    
149        checkArgument(name.length() <= MAX_LENGTH, "Domain name too long: '%s':", name);
150        this.name = name;
151    
152        this.parts = ImmutableList.copyOf(DOT_SPLITTER.split(name));
153        checkArgument(parts.size() <= MAX_PARTS, "Domain has too many parts: '%s'", name);
154        checkArgument(validateSyntax(parts), "Not a valid domain name: '%s'", name);
155    
156        this.publicSuffixIndex = findPublicSuffix();
157      }
158    
159      /**
160       * Returns the index of the leftmost part of the public suffix, or -1 if not
161       * found. Note that the value defined as the "public suffix" may not be a
162       * public suffix according to {@link #isPublicSuffix()} if the domain ends
163       * with an excluded domain pattern such as {@code "nhs.uk"}.
164       */
165      private int findPublicSuffix() {
166        final int partsSize = parts.size();
167    
168        for (int i = 0; i < partsSize; i++) {
169          String ancestorName = DOT_JOINER.join(parts.subList(i, partsSize));
170    
171          if (TldPatterns.EXACT.contains(ancestorName)) {
172            return i;
173          }
174    
175          // Excluded domains (e.g. !nhs.uk) use the next highest
176          // domain as the effective public suffix (e.g. uk).
177    
178          if (TldPatterns.EXCLUDED.contains(ancestorName)) {
179            return i + 1;
180          }
181    
182          if (matchesWildcardPublicSuffix(ancestorName)) {
183            return i;
184          }
185        }
186    
187        return NO_PUBLIC_SUFFIX_FOUND;
188      }
189    
190      /**
191       * A deprecated synonym for {@link #from(String)}.
192       *
193       * @param domain A domain name (not IP address)
194       * @throws IllegalArgumentException if {@code name} is not syntactically valid
195       *     according to {@link #isValidLenient}
196       * @since 8.0 (previously named {@code from})
197       * @deprecated Use {@link #from(String)}
198       */
199      @Deprecated
200      public static InternetDomainName fromLenient(String domain) {
201        return from(domain);
202      }
203    
204      /**
205       * Returns an instance of {@link InternetDomainName} after lenient
206       * validation.  Specifically, validation against <a
207       * href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>
208       * ("Internationalizing Domain Names in Applications") is skipped, while
209       * validation against <a
210       * href="http://www.ietf.org/rfc/rfc1035.txt">RFC 1035</a> is relaxed in
211       * the following ways:
212       * <ul>
213       * <li>Any part containing non-ASCII characters is considered valid.
214       * <li>Underscores ('_') are permitted wherever dashes ('-') are permitted.
215       * <li>Parts other than the final part may start with a digit.
216       * </ul>
217       *
218       *
219       * @param domain A domain name (not IP address)
220       * @throws IllegalArgumentException if {@code name} is not syntactically valid
221       *     according to {@link #isValid}
222       * @since 10.0 (previously named {@code fromLenient})
223       */
224      public static InternetDomainName from(String domain) {
225        return new InternetDomainName(checkNotNull(domain));
226      }
227    
228      /**
229       * Validation method used by {@from} to ensure that the domain name is
230       * syntactically valid according to RFC 1035.
231       *
232       * @return Is the domain name syntactically valid?
233       */
234      private static boolean validateSyntax(List<String> parts) {
235        final int lastIndex = parts.size() - 1;
236    
237        // Validate the last part specially, as it has different syntax rules.
238    
239        if (!validatePart(parts.get(lastIndex), true)) {
240          return false;
241        }
242    
243        for (int i = 0; i < lastIndex; i++) {
244          String part = parts.get(i);
245          if (!validatePart(part, false)) {
246            return false;
247          }
248        }
249    
250        return true;
251      }
252    
253      private static final CharMatcher DASH_MATCHER = CharMatcher.anyOf("-_");
254    
255      private static final CharMatcher PART_CHAR_MATCHER =
256          CharMatcher.JAVA_LETTER_OR_DIGIT.or(DASH_MATCHER);
257    
258      /**
259       * Helper method for {@link #validateSyntax(List)}. Validates that one part of
260       * a domain name is valid.
261       *
262       * @param part The domain name part to be validated
263       * @param isFinalPart Is this the final (rightmost) domain part?
264       * @return Whether the part is valid
265       */
266      private static boolean validatePart(String part, boolean isFinalPart) {
267    
268        // These tests could be collapsed into one big boolean expression, but
269        // they have been left as independent tests for clarity.
270    
271        if (part.length() < 1 || part.length() > MAX_DOMAIN_PART_LENGTH) {
272          return false;
273        }
274    
275        /*
276         * GWT claims to support java.lang.Character's char-classification methods,
277         * but it actually only works for ASCII. So for now, assume any non-ASCII
278         * characters are valid. The only place this seems to be documented is here:
279         * http://osdir.com/ml/GoogleWebToolkitContributors/2010-03/msg00178.html
280         *
281         * <p>ASCII characters in the part are expected to be valid per RFC 1035,
282         * with underscore also being allowed due to widespread practice.
283         */
284    
285        String asciiChars = CharMatcher.ASCII.retainFrom(part);
286    
287        if (!PART_CHAR_MATCHER.matchesAllOf(asciiChars)) {
288          return false;
289        }
290    
291        // No initial or final dashes or underscores.
292    
293        if (DASH_MATCHER.matches(part.charAt(0))
294            || DASH_MATCHER.matches(part.charAt(part.length() - 1))) {
295          return false;
296        }
297    
298        /*
299         * Note that we allow (in contravention of a strict interpretation of the
300         * relevant RFCs) domain parts other than the last may begin with a digit
301         * (for example, "3com.com"). It's important to disallow an initial digit in
302         * the last part; it's the only thing that stops an IPv4 numeric address
303         * like 127.0.0.1 from looking like a valid domain name.
304         */
305    
306        if (isFinalPart && CharMatcher.DIGIT.matches(part.charAt(0))) {
307          return false;
308        }
309    
310        return true;
311      }
312    
313      /**
314       * Returns the domain name, normalized to all lower case.
315       */
316      public String name() {
317        return name;
318      }
319    
320      /**
321       * Returns the individual components of this domain name, normalized to all
322       * lower case. For example, for the domain name {@code mail.google.com}, this
323       * method returns the list {@code ["mail", "google", "com"]}.
324       */
325      public ImmutableList<String> parts() {
326        return parts;
327      }
328    
329      /**
330       * Indicates whether this domain name represents a <i>public suffix</i>, as
331       * defined by the Mozilla Foundation's
332       * <a href="http://publicsuffix.org/">Public Suffix List</a> (PSL). A public
333       * suffix is one under which Internet users can directly register names, such
334       * as {@code com}, {@code co.uk} or {@code pvt.k12.wy.us}. Examples of domain
335       * names that are <i>not</i> public suffixes include {@code google}, {@code
336       * google.com} and {@code foo.co.uk}.
337       *
338       * @return {@code true} if this domain name appears exactly on the public
339       *     suffix list
340       * @since 6.0
341       */
342      public boolean isPublicSuffix() {
343        return publicSuffixIndex == 0;
344      }
345    
346      /**
347       * Indicates whether this domain name ends in a {@linkplain #isPublicSuffix()
348       * public suffix}, including if it is a public suffix itself. For example,
349       * returns {@code true} for {@code www.google.com}, {@code foo.co.uk} and
350       * {@code com}, but not for {@code google} or {@code google.foo}. This is
351       * the recommended method for determining whether a domain is potentially an
352       * addressable host.
353       *
354       * @since 6.0
355       */
356      public boolean hasPublicSuffix() {
357        return publicSuffixIndex != NO_PUBLIC_SUFFIX_FOUND;
358      }
359    
360      /**
361       * Returns the {@linkplain #isPublicSuffix() public suffix} portion of the
362       * domain name, or {@code null} if no public suffix is present.
363       *
364       * @since 6.0
365       */
366      public InternetDomainName publicSuffix() {
367        return hasPublicSuffix() ? ancestor(publicSuffixIndex) : null;
368      }
369    
370      /**
371       * Indicates whether this domain name ends in a {@linkplain #isPublicSuffix()
372       * public suffix}, while not being a public suffix itself. For example,
373       * returns {@code true} for {@code www.google.com}, {@code foo.co.uk} and
374       * {@code bar.ca.us}, but not for {@code google}, {@code com}, or {@code
375       * google.foo}.
376       *
377       * <p><b>Warning:</b> a {@code false} result from this method does not imply
378       * that the domain does not represent an addressable host, as many public
379       * suffixes are also addressable hosts. Use {@link #hasPublicSuffix()} for
380       * that test.
381       *
382       * <p>This method can be used to determine whether it will probably be
383       * possible to set cookies on the domain, though even that depends on
384       * individual browsers' implementations of cookie controls. See
385       * <a href="http://www.ietf.org/rfc/rfc2109.txt">RFC 2109</a> for details.
386       *
387       * @since 6.0
388       */
389      public boolean isUnderPublicSuffix() {
390        return publicSuffixIndex > 0;
391      }
392    
393      /**
394       * Indicates whether this domain name is composed of exactly one subdomain
395       * component followed by a {@linkplain #isPublicSuffix() public suffix}. For
396       * example, returns {@code true} for {@code google.com} and {@code foo.co.uk},
397       * but not for {@code www.google.com} or {@code co.uk}.
398       *
399       * <p><b>Warning:</b> A {@code true} result from this method does not imply
400       * that the domain is at the highest level which is addressable as a host, as
401       * many public suffixes are also addressable hosts. For example, the domain
402       * {@code bar.uk.com} has a public suffix of {@code uk.com}, so it would
403       * return {@code true} from this method. But {@code uk.com} is itself an
404       * addressable host.
405       *
406       * <p>This method can be used to determine whether a domain is probably the
407       * highest level for which cookies may be set, though even that depends on
408       * individual browsers' implementations of cookie controls. See
409       * <a href="http://www.ietf.org/rfc/rfc2109.txt">RFC 2109</a> for details.
410       *
411       * @since 6.0
412       */
413      public boolean isTopPrivateDomain() {
414        return publicSuffixIndex == 1;
415      }
416    
417      /**
418       * Returns the portion of this domain name that is one level beneath the
419       * public suffix. For example, for {@code x.adwords.google.co.uk} it returns
420       * {@code google.co.uk}, since {@code co.uk} is a public suffix.
421       *
422       * <p>If {@link #isTopPrivateDomain()} is true, the current domain name
423       * instance is returned.
424       *
425       * <p>This method should not be used to determine the topmost parent domain
426       * which is addressable as a host, as many public suffixes are also
427       * addressable hosts. For example, the domain {@code foo.bar.uk.com} has
428       * a public suffix of {@code uk.com}, so it would return {@code bar.uk.com}
429       * from this method. But {@code uk.com} is itself an addressable host.
430       *
431       * <p>This method can be used to determine the probable highest level parent
432       * domain for which cookies may be set, though even that depends on individual
433       * browsers' implementations of cookie controls.
434       *
435       * @throws IllegalStateException if this domain does not end with a
436       *     public suffix
437       * @since 6.0
438       */
439      public InternetDomainName topPrivateDomain() {
440        if (isTopPrivateDomain()) {
441          return this;
442        }
443        checkState(isUnderPublicSuffix(), "Not under a public suffix: %s", name);
444        return ancestor(publicSuffixIndex - 1);
445      }
446    
447      /**
448       * Indicates whether this domain is composed of two or more parts.
449       */
450      public boolean hasParent() {
451        return parts.size() > 1;
452      }
453    
454      /**
455       * Returns an {@code InternetDomainName} that is the immediate ancestor of
456       * this one; that is, the current domain with the leftmost part removed. For
457       * example, the parent of {@code www.google.com} is {@code google.com}.
458       *
459       * @throws IllegalStateException if the domain has no parent, as determined
460       *     by {@link #hasParent}
461       */
462      public InternetDomainName parent() {
463        checkState(hasParent(), "Domain '%s' has no parent", name);
464        return ancestor(1);
465      }
466    
467      /**
468       * Returns the ancestor of the current domain at the given number of levels
469       * "higher" (rightward) in the subdomain list. The number of levels must be
470       * non-negative, and less than {@code N-1}, where {@code N} is the number of
471       * parts in the domain.
472       *
473       * <p>TODO: Reasonable candidate for addition to public API.
474       */
475      private InternetDomainName ancestor(int levels) {
476        return from(DOT_JOINER.join(parts.subList(levels, parts.size())));
477      }
478    
479      /**
480       * Creates and returns a new {@code InternetDomainName} by prepending the
481       * argument and a dot to the current name. For example, {@code
482       * InternetDomainName.from("foo.com").child("www.bar")} returns a new
483       * {@code InternetDomainName} with the value {@code www.bar.foo.com}. Only
484       * lenient validation is performed, as described {@link #from(String) here}.
485       *
486       * @throws NullPointerException if leftParts is null
487       * @throws IllegalArgumentException if the resulting name is not valid
488       */
489      public InternetDomainName child(String leftParts) {
490        return from(checkNotNull(leftParts) + "." + name);
491      }
492    
493      /**
494       * A deprecated synonym for {@link #isValid(String)}.
495       *
496       * @since 8.0 (previously named {@code isValid})
497       * @deprecated Use {@link #isValid(String)} instead
498       */
499      @Deprecated
500      public static boolean isValidLenient(String name) {
501        return isValid(name);
502      }
503    
504      /**
505       * Indicates whether the argument is a syntactically valid domain name using
506       * lenient validation. Specifically, validation against <a
507       * href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>
508       * ("Internationalizing Domain Names in Applications") is skipped.
509       *
510       * <p>The following two code snippets are equivalent:
511       *
512       * <pre>   {@code
513       *
514       *   domainName = InternetDomainName.isValid(name)
515       *       ? InternetDomainName.from(name)
516       *       : DEFAULT_DOMAIN;
517       *   }</pre>
518       *
519       * <pre>   {@code
520       *
521       *   try {
522       *     domainName = InternetDomainName.from(name);
523       *   } catch (IllegalArgumentException e) {
524       *     domainName = DEFAULT_DOMAIN;
525       *   }}</pre>
526       *
527       * @since 8.0 (previously named {@code isValidLenient})
528       */
529      public static boolean isValid(String name) {
530        try {
531          from(name);
532          return true;
533        } catch (IllegalArgumentException e) {
534          return false;
535        }
536      }
537    
538      /**
539       * Does the domain name match one of the "wildcard" patterns (e.g.
540       * {@code "*.ar"})?
541       */
542      private static boolean matchesWildcardPublicSuffix(String domain) {
543        final String[] pieces = domain.split(DOT_REGEX, 2);
544        return pieces.length == 2 && TldPatterns.UNDER.contains(pieces[1]);
545      }
546    
547      // TODO: specify this to return the same as name(); remove name()
548      @Override
549      public String toString() {
550        return Objects.toStringHelper(this).add("name", name).toString();
551      }
552    
553      /**
554       * Equality testing is based on the text supplied by the caller,
555       * after normalization as described in the class documentation. For
556       * example, a non-ASCII Unicode domain name and the Punycode version
557       * of the same domain name would not be considered equal.
558       *
559       */
560      @Override
561      public boolean equals(@Nullable Object object) {
562        if (object == this) {
563          return true;
564        }
565    
566        if (object instanceof InternetDomainName) {
567          InternetDomainName that = (InternetDomainName) object;
568          return this.name.equals(that.name);
569        }
570    
571        return false;
572      }
573    
574      @Override
575      public int hashCode() {
576        return name.hashCode();
577      }
578    }