001    /*
002     * Copyright (C) 2009 The Guava Authors
003     *
004     * Licensed under the Apache License, Version 2.0 (the "License");
005     * you may not use this file except in compliance with the License.
006     * You may obtain a copy of the License at
007     *
008     * http://www.apache.org/licenses/LICENSE-2.0
009     *
010     * Unless required by applicable law or agreed to in writing, software
011     * distributed under the License is distributed on an "AS IS" BASIS,
012     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013     * See the License for the specific language governing permissions and
014     * limitations under the License.
015     */
016    
017    package com.google.common.net;
018    
019    import static com.google.common.base.Preconditions.checkArgument;
020    import static com.google.common.base.Preconditions.checkNotNull;
021    import static com.google.common.base.Preconditions.checkState;
022    
023    import com.google.common.annotations.Beta;
024    import com.google.common.annotations.GwtCompatible;
025    import com.google.common.base.Ascii;
026    import com.google.common.base.CharMatcher;
027    import com.google.common.base.Joiner;
028    import com.google.common.base.Objects;
029    import com.google.common.base.Splitter;
030    import com.google.common.collect.ImmutableList;
031    
032    import java.util.List;
033    
034    import javax.annotation.Nullable;
035    
036    /**
037     * An immutable well-formed internet domain name, such as {@code com} or {@code
038     * foo.co.uk}. Only syntactic analysis is performed; no DNS lookups or other
039     * network interactions take place. Thus there is no guarantee that the domain
040     * actually exists on the internet.
041     *
042     * <p>One common use of this class is to determine whether a given string is
043     * likely to represent an addressable domain on the web -- that is, for a
044     * candidate string {@code "xxx"}, might browsing to {@code "http://xxx/"}
045     * result in a webpage being displayed? In the past, this test was frequently
046     * done by determining whether the domain ended with a {@linkplain
047     * #isPublicSuffix() public suffix} but was not itself a public suffix. However,
048     * this test is no longer accurate. There are many domains which are both public
049     * suffixes and addressable as hosts; {@code "uk.com"} is one example. As a
050     * result, the only useful test to determine if a domain is a plausible web host
051     * is {@link #hasPublicSuffix()}. This will return {@code true} for many domains
052     * which (currently) are not hosts, such as {@code "com"}), but given that any
053     * public suffix may become a host without warning, it is better to err on the
054     * side of permissiveness and thus avoid spurious rejection of valid sites.
055     *
056     * <p>During construction, names are normalized in two ways:
057     * <ol>
058     * <li>ASCII uppercase characters are converted to lowercase.
059     * <li>Unicode dot separators other than the ASCII period ({@code '.'}) are
060     * converted to the ASCII period.
061     * </ol>
062     * The normalized values will be returned from {@link #name()} and
063     * {@link #parts()}, and will be reflected in the result of
064     * {@link #equals(Object)}.
065     *
066     * <p><a href="http://en.wikipedia.org/wiki/Internationalized_domain_name">
067     * internationalized domain names</a> such as {@code 网络.cn} are supported, as
068     * are the equivalent <a
069     * href="http://en.wikipedia.org/wiki/Internationalized_domain_name">IDNA
070     * Punycode-encoded</a> versions.
071     *
072     * @author Craig Berry
073     * @since 5.0
074     */
075    @Beta
076    @GwtCompatible(emulated = true)
077    public final class InternetDomainName {
078    
079      private static final CharMatcher DOTS_MATCHER =
080          CharMatcher.anyOf(".\u3002\uFF0E\uFF61");
081      private static final Splitter DOT_SPLITTER = Splitter.on('.');
082      private static final Joiner DOT_JOINER = Joiner.on('.');
083    
084      /**
085       * Value of {@link #publicSuffixIndex} which indicates that no public suffix
086       * was found.
087       */
088      private static final int NO_PUBLIC_SUFFIX_FOUND = -1;
089    
090      private static final String DOT_REGEX = "\\.";
091    
092      /**
093       * Maximum parts (labels) in a domain name. This value arises from
094       * the 255-octet limit described in
095       * <a href="http://www.ietf.org/rfc/rfc2181.txt">RFC 2181</a> part 11 with
096       * the fact that the encoding of each part occupies at least two bytes
097       * (dot plus label externally, length byte plus label internally). Thus, if
098       * all labels have the minimum size of one byte, 127 of them will fit.
099       */
100      private static final int MAX_PARTS = 127;
101    
102      /**
103       * Maximum length of a full domain name, including separators, and
104       * leaving room for the root label. See
105       * <a href="http://www.ietf.org/rfc/rfc2181.txt">RFC 2181</a> part 11.
106       */
107      private static final int MAX_LENGTH = 253;
108    
109      /**
110       * Maximum size of a single part of a domain name. See
111       * <a href="http://www.ietf.org/rfc/rfc2181.txt">RFC 2181</a> part 11.
112       */
113      private static final int MAX_DOMAIN_PART_LENGTH = 63;
114    
115      /**
116       * The full domain name, converted to lower case.
117       */
118      private final String name;
119    
120      /**
121       * The parts of the domain name, converted to lower case.
122       */
123      private final ImmutableList<String> parts;
124    
125      /**
126       * The index in the {@link #parts()} list at which the public suffix begins.
127       * For example, for the domain name {@code www.google.co.uk}, the value would
128       * be 2 (the index of the {@code co} part). The value is negative
129       * (specifically, {@link #NO_PUBLIC_SUFFIX_FOUND}) if no public suffix was
130       * found.
131       */
132      private final int publicSuffixIndex;
133    
134      /**
135       * Constructor used to implement {@link #from(String)}, and from subclasses.
136       */
137      InternetDomainName(String name) {
138        // Normalize:
139        // * ASCII characters to lowercase
140        // * All dot-like characters to '.'
141        // * Strip trailing '.'
142    
143        name = Ascii.toLowerCase(DOTS_MATCHER.replaceFrom(name, '.'));
144    
145        if (name.endsWith(".")) {
146          name = name.substring(0, name.length() - 1);
147        }
148    
149        checkArgument(name.length() <= MAX_LENGTH,
150            "Domain name too long: '%s':", name);
151        this.name = name;
152    
153        this.parts = ImmutableList.copyOf(DOT_SPLITTER.split(name));
154        checkArgument(parts.size() <= MAX_PARTS,
155            "Domain has too many parts: '%s'", name);
156        checkArgument(validateSyntax(parts), "Not a valid domain name: '%s'", name);
157    
158        this.publicSuffixIndex = findPublicSuffix();
159      }
160    
161      /**
162       * Returns the index of the leftmost part of the public suffix, or -1 if not
163       * found. Note that the value defined as the "public suffix" may not be a
164       * public suffix according to {@link #isPublicSuffix()} if the domain ends
165       * with an excluded domain pattern such as {@code "nhs.uk"}.
166       */
167      private int findPublicSuffix() {
168        final int partsSize = parts.size();
169    
170        for (int i = 0; i < partsSize; i++) {
171          String ancestorName = DOT_JOINER.join(parts.subList(i, partsSize));
172    
173          if (TldPatterns.EXACT.contains(ancestorName)) {
174            return i;
175          }
176    
177          // Excluded domains (e.g. !nhs.uk) use the next highest
178          // domain as the effective public suffix (e.g. uk).
179    
180          if (TldPatterns.EXCLUDED.contains(ancestorName)) {
181            return i + 1;
182          }
183    
184          if (matchesWildcardPublicSuffix(ancestorName)) {
185            return i;
186          }
187        }
188    
189        return NO_PUBLIC_SUFFIX_FOUND;
190      }
191    
192      /**
193       * A deprecated synonym for {@link #from(String)}.
194       *
195       * @param domain A domain name (not IP address)
196       * @throws IllegalArgumentException if {@code name} is not syntactically valid
197       *     according to {@link #isValidLenient}
198       * @since 8.0 (previously named {@code from})
199       * @deprecated Use {@link #from(String)}
200       */
201      @Deprecated
202      public static InternetDomainName fromLenient(String domain) {
203        return from(domain);
204      }
205    
206      /**
207       * Returns an instance of {@link InternetDomainName} after lenient
208       * validation.  Specifically, validation against <a
209       * href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>
210       * ("Internationalizing Domain Names in Applications") is skipped, while
211       * validation against <a
212       * href="http://www.ietf.org/rfc/rfc1035.txt">RFC 1035</a> is relaxed in
213       * the following ways:
214       * <ul>
215       * <li>Any part containing non-ASCII characters is considered valid.
216       * <li>Underscores ('_') are permitted wherever dashes ('-') are permitted.
217       * <li>Parts other than the final part may start with a digit.
218       * </ul>
219       *
220       *
221       * @param domain A domain name (not IP address)
222       * @throws IllegalArgumentException if {@code name} is not syntactically valid
223       *     according to {@link #isValid}
224       * @since 10.0 (previously named {@code fromLenient})
225       */
226      public static InternetDomainName from(String domain) {
227        return new InternetDomainName(checkNotNull(domain));
228      }
229    
230      /**
231       * Validation method used by {@from} to ensure that the domain name is
232       * syntactically valid according to RFC 1035.
233       *
234       * @return Is the domain name syntactically valid?
235       */
236      private static boolean validateSyntax(List<String> parts) {
237        final int lastIndex = parts.size() - 1;
238    
239        // Validate the last part specially, as it has different syntax rules.
240    
241        if (!validatePart(parts.get(lastIndex), true)) {
242          return false;
243        }
244    
245        for (int i = 0; i < lastIndex; i++) {
246          String part = parts.get(i);
247          if (!validatePart(part, false)) {
248            return false;
249          }
250        }
251    
252        return true;
253      }
254    
255      private static final CharMatcher DASH_MATCHER = CharMatcher.anyOf("-_");
256    
257      private static final CharMatcher PART_CHAR_MATCHER =
258          CharMatcher.JAVA_LETTER_OR_DIGIT.or(DASH_MATCHER);
259    
260      /**
261       * Helper method for {@link #validateSyntax(List)}. Validates that one part of
262       * a domain name is valid.
263       *
264       * @param part The domain name part to be validated
265       * @param isFinalPart Is this the final (rightmost) domain part?
266       * @return Whether the part is valid
267       */
268      private static boolean validatePart(String part, boolean isFinalPart) {
269    
270        // These tests could be collapsed into one big boolean expression, but
271        // they have been left as independent tests for clarity.
272    
273        if (part.length() < 1 || part.length() > MAX_DOMAIN_PART_LENGTH) {
274          return false;
275        }
276    
277        /*
278         * GWT claims to support java.lang.Character's char-classification methods,
279         * but it actually only works for ASCII. So for now, assume any non-ASCII
280         * characters are valid. The only place this seems to be documented is here:
281         * http://osdir.com/ml/GoogleWebToolkitContributors/2010-03/msg00178.html
282         *
283         * <p>ASCII characters in the part are expected to be valid per RFC 1035,
284         * with underscore also being allowed due to widespread practice.
285         */
286    
287        String asciiChars = CharMatcher.ASCII.retainFrom(part);
288    
289        if (!PART_CHAR_MATCHER.matchesAllOf(asciiChars)) {
290          return false;
291        }
292    
293        // No initial or final dashes or underscores.
294    
295        if (DASH_MATCHER.matches(part.charAt(0))
296            || DASH_MATCHER.matches(part.charAt(part.length() - 1))) {
297          return false;
298        }
299    
300        /*
301         * Note that we allow (in contravention of a strict interpretation of the
302         * relevant RFCs) domain parts other than the last may begin with a digit
303         * (for example, "3com.com"). It's important to disallow an initial digit in
304         * the last part; it's the only thing that stops an IPv4 numeric address
305         * like 127.0.0.1 from looking like a valid domain name.
306         */
307    
308        if (isFinalPart && CharMatcher.DIGIT.matches(part.charAt(0))) {
309          return false;
310        }
311    
312        return true;
313      }
314    
315      /**
316       * Returns the domain name, normalized to all lower case.
317       */
318      public String name() {
319        return name;
320      }
321    
322      /**
323       * Returns the individual components of this domain name, normalized to all
324       * lower case. For example, for the domain name {@code mail.google.com}, this
325       * method returns the list {@code ["mail", "google", "com"]}.
326       */
327      public ImmutableList<String> parts() {
328        return parts;
329      }
330    
331      /**
332       * Indicates whether this domain name represents a <i>public suffix</i>, as
333       * defined by the Mozilla Foundation's
334       * <a href="http://publicsuffix.org/">Public Suffix List</a> (PSL). A public
335       * suffix is one under which Internet users can directly register names, such
336       * as {@code com}, {@code co.uk} or {@code pvt.k12.wy.us}. Examples of domain
337       * names that are <i>not</i> public suffixes include {@code google}, {@code
338       * google.com} and {@code foo.co.uk}.
339       *
340       * @return {@code true} if this domain name appears exactly on the public
341       *     suffix list
342       * @since 6.0
343       */
344      public boolean isPublicSuffix() {
345        return publicSuffixIndex == 0;
346      }
347    
348      /**
349       * Indicates whether this domain name ends in a {@linkplain #isPublicSuffix()
350       * public suffix}, including if it is a public suffix itself. For example,
351       * returns {@code true} for {@code www.google.com}, {@code foo.co.uk} and
352       * {@code com}, but not for {@code google} or {@code google.foo}. This is
353       * the recommended method for determining whether a domain is potentially an
354       * addressable host.
355       *
356       * @since 6.0
357       */
358      public boolean hasPublicSuffix() {
359        return publicSuffixIndex != NO_PUBLIC_SUFFIX_FOUND;
360      }
361    
362      /**
363       * Returns the {@linkplain #isPublicSuffix() public suffix} portion of the
364       * domain name, or {@code null} if no public suffix is present.
365       *
366       * @since 6.0
367       */
368      public InternetDomainName publicSuffix() {
369        return hasPublicSuffix() ? ancestor(publicSuffixIndex) : null;
370      }
371    
372      /**
373       * Indicates whether this domain name ends in a {@linkplain #isPublicSuffix()
374       * public suffix}, while not being a public suffix itself. For example,
375       * returns {@code true} for {@code www.google.com}, {@code foo.co.uk} and
376       * {@code bar.ca.us}, but not for {@code google}, {@code com}, or {@code
377       * google.foo}.
378       *
379       * <p><b>Warning:</b> a {@code false} result from this method does not imply
380       * that the domain does not represent an addressable host, as many public
381       * suffixes are also addressable hosts. Use {@link #hasPublicSuffix()} for
382       * that test.
383       *
384       * <p>This method can be used to determine whether it will probably be
385       * possible to set cookies on the domain, though even that depends on
386       * individual browsers' implementations of cookie controls. See
387       * <a href="http://www.ietf.org/rfc/rfc2109.txt">RFC 2109</a> for details.
388       *
389       * @since 6.0
390       */
391      public boolean isUnderPublicSuffix() {
392        return publicSuffixIndex > 0;
393      }
394    
395      /**
396       * Indicates whether this domain name is composed of exactly one subdomain
397       * component followed by a {@linkplain #isPublicSuffix() public suffix}. For
398       * example, returns {@code true} for {@code google.com} and {@code foo.co.uk},
399       * but not for {@code www.google.com} or {@code co.uk}.
400       *
401       * <p><b>Warning:</b> A {@code true} result from this method does not imply
402       * that the domain is at the highest level which is addressable as a host, as
403       * many public suffixes are also addressable hosts. For example, the domain
404       * {@code bar.uk.com} has a public suffix of {@code uk.com}, so it would
405       * return {@code true} from this method. But {@code uk.com} is itself an
406       * addressable host.
407       *
408       * <p>This method can be used to determine whether a domain is probably the
409       * highest level for which cookies may be set, though even that depends on
410       * individual browsers' implementations of cookie controls. See
411       * <a href="http://www.ietf.org/rfc/rfc2109.txt">RFC 2109</a> for details.
412       *
413       * @since 6.0
414       */
415      public boolean isTopPrivateDomain() {
416        return publicSuffixIndex == 1;
417      }
418    
419      /**
420       * Returns the portion of this domain name that is one level beneath the
421       * public suffix. For example, for {@code x.adwords.google.co.uk} it returns
422       * {@code google.co.uk}, since {@code co.uk} is a public suffix.
423       *
424       * <p>If {@link #isTopPrivateDomain()} is true, the current domain name
425       * instance is returned.
426       *
427       * <p>This method should not be used to determine the topmost parent domain
428       * which is addressable as a host, as many public suffixes are also
429       * addressable hosts. For example, the domain {@code foo.bar.uk.com} has
430       * a public suffix of {@code uk.com}, so it would return {@code bar.uk.com}
431       * from this method. But {@code uk.com} is itself an addressable host.
432       *
433       * <p>This method can be used to determine the probable highest level parent
434       * domain for which cookies may be set, though even that depends on individual
435       * browsers' implementations of cookie controls.
436       *
437       * @throws IllegalStateException if this domain does not end with a
438       *     public suffix
439       * @since 6.0
440       */
441      public InternetDomainName topPrivateDomain() {
442        if (isTopPrivateDomain()) {
443          return this;
444        }
445        checkState(isUnderPublicSuffix(), "Not under a public suffix: %s", name);
446        return ancestor(publicSuffixIndex - 1);
447      }
448    
449      /**
450       * Indicates whether this domain is composed of two or more parts.
451       */
452      public boolean hasParent() {
453        return parts.size() > 1;
454      }
455    
456      /**
457       * Returns an {@code InternetDomainName} that is the immediate ancestor of
458       * this one; that is, the current domain with the leftmost part removed. For
459       * example, the parent of {@code www.google.com} is {@code google.com}.
460       *
461       * @throws IllegalStateException if the domain has no parent, as determined
462       *     by {@link #hasParent}
463       */
464      public InternetDomainName parent() {
465        checkState(hasParent(), "Domain '%s' has no parent", name);
466        return ancestor(1);
467      }
468    
469      /**
470       * Returns the ancestor of the current domain at the given number of levels
471       * "higher" (rightward) in the subdomain list. The number of levels must be
472       * non-negative, and less than {@code N-1}, where {@code N} is the number of
473       * parts in the domain.
474       *
475       * <p>TODO: Reasonable candidate for addition to public API.
476       */
477      private InternetDomainName ancestor(int levels) {
478        return from(DOT_JOINER.join(parts.subList(levels, parts.size())));
479      }
480    
481      /**
482       * Creates and returns a new {@code InternetDomainName} by prepending the
483       * argument and a dot to the current name. For example, {@code
484       * InternetDomainName.from("foo.com").child("www.bar")} returns a new
485       * {@code InternetDomainName} with the value {@code www.bar.foo.com}. Only
486       * lenient validation is performed, as described {@link #from(String) here}.
487       *
488       * @throws NullPointerException if leftParts is null
489       * @throws IllegalArgumentException if the resulting name is not valid
490       */
491      public InternetDomainName child(String leftParts) {
492        return from(checkNotNull(leftParts) + "." + name);
493      }
494    
495      /**
496       * A deprecated synonym for {@link #isValid(String)}.
497       *
498       * @since 8.0 (previously named {@code isValid})
499       * @deprecated Use {@link #isValid(String)} instead
500       */
501      @Deprecated
502      public static boolean isValidLenient(String name) {
503        return isValid(name);
504      }
505    
506      /**
507       * Indicates whether the argument is a syntactically valid domain name using
508       * lenient validation. Specifically, validation against <a
509       * href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>
510       * ("Internationalizing Domain Names in Applications") is skipped.
511       *
512       * <p>The following two code snippets are equivalent:
513       *
514       * <pre>   {@code
515       *
516       *   domainName = InternetDomainName.isValid(name)
517       *       ? InternetDomainName.from(name)
518       *       : DEFAULT_DOMAIN;
519       *   }</pre>
520       *
521       * <pre>   {@code
522       *
523       *   try {
524       *     domainName = InternetDomainName.from(name);
525       *   } catch (IllegalArgumentException e) {
526       *     domainName = DEFAULT_DOMAIN;
527       *   }}</pre>
528       *
529       * @since 8.0 (previously named {@code isValidLenient})
530       */
531      public static boolean isValid(String name) {
532        try {
533          from(name);
534          return true;
535        } catch (IllegalArgumentException e) {
536          return false;
537        }
538      }
539    
540      /**
541       * Does the domain name match one of the "wildcard" patterns (e.g.
542       * {@code "*.ar"})?
543       */
544      private static boolean matchesWildcardPublicSuffix(String domain) {
545        final String[] pieces = domain.split(DOT_REGEX, 2);
546        return pieces.length == 2 && TldPatterns.UNDER.contains(pieces[1]);
547      }
548    
549      // TODO: specify this to return the same as name(); remove name()
550      @Override
551      public String toString() {
552        return Objects.toStringHelper(this).add("name", name).toString();
553      }
554    
555      /**
556       * Equality testing is based on the text supplied by the caller,
557       * after normalization as described in the class documentation. For
558       * example, a non-ASCII Unicode domain name and the Punycode version
559       * of the same domain name would not be considered equal.
560       *
561       */
562      @Override
563      public boolean equals(@Nullable Object object) {
564        if (object == this) {
565          return true;
566        }
567    
568        if (object instanceof InternetDomainName) {
569          InternetDomainName that = (InternetDomainName) object;
570          return this.name.equals(that.name);
571        }
572    
573        return false;
574      }
575    
576      @Override
577      public int hashCode() {
578        return name.hashCode();
579      }
580    }