001    /*
002     * Copyright (C) 2009 The Guava Authors
003     *
004     * Licensed under the Apache License, Version 2.0 (the "License");
005     * you may not use this file except in compliance with the License.
006     * You may obtain a copy of the License at
007     *
008     * http://www.apache.org/licenses/LICENSE-2.0
009     *
010     * Unless required by applicable law or agreed to in writing, software
011     * distributed under the License is distributed on an "AS IS" BASIS,
012     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013     * See the License for the specific language governing permissions and
014     * limitations under the License.
015     */
016    
017    package com.google.common.net;
018    
019    import static com.google.common.base.Preconditions.checkArgument;
020    import static com.google.common.base.Preconditions.checkNotNull;
021    import static com.google.common.base.Preconditions.checkState;
022    
023    import com.google.common.annotations.Beta;
024    import com.google.common.annotations.GwtCompatible;
025    import com.google.common.base.Ascii;
026    import com.google.common.base.CharMatcher;
027    import com.google.common.base.Joiner;
028    import com.google.common.base.Objects;
029    import com.google.common.base.Splitter;
030    import com.google.common.collect.ImmutableList;
031    
032    import java.util.List;
033    
034    import javax.annotation.Nullable;
035    
036    /**
037     * An immutable well-formed internet domain name, such as {@code com} or {@code
038     * foo.co.uk}. Only syntactic analysis is performed; no DNS lookups or other
039     * network interactions take place. Thus there is no guarantee that the domain
040     * actually exists on the internet.
041     *
042     * <p>One common use of this class is to determine whether a given string is
043     * likely to represent an addressable domain on the web -- that is, for a
044     * candidate string {@code "xxx"}, might browsing to {@code "http://xxx/"}
045     * result in a webpage being displayed? In the past, this test was frequently
046     * done by determining whether the domain ended with a {@linkplain
047     * #isPublicSuffix() public suffix} but was not itself a public suffix. However,
048     * this test is no longer accurate. There are many domains which are both public
049     * suffixes and addressable as hosts; {@code "uk.com"} is one example. As a
050     * result, the only useful test to determine if a domain is a plausible web host
051     * is {@link #hasPublicSuffix()}. This will return {@code true} for many domains
052     * which (currently) are not hosts, such as {@code "com"}), but given that any
053     * public suffix may become a host without warning, it is better to err on the
054     * side of permissiveness and thus avoid spurious rejection of valid sites.
055     *
056     * <p>During construction, names are normalized in two ways:
057     * <ol>
058     * <li>ASCII uppercase characters are converted to lowercase.
059     * <li>Unicode dot separators other than the ASCII period ({@code '.'}) are
060     * converted to the ASCII period.
061     * </ol>
062     * The normalized values will be returned from {@link #name()} and
063     * {@link #parts()}, and will be reflected in the result of
064     * {@link #equals(Object)}.
065     *
066     * <p><a href="http://en.wikipedia.org/wiki/Internationalized_domain_name">
067     * internationalized domain names</a> such as {@code 网络.cn} are supported, as
068     * are the equivalent <a
069     * href="http://en.wikipedia.org/wiki/Internationalized_domain_name">IDNA
070     * Punycode-encoded</a> versions.
071     *
072     * @author Craig Berry
073     * @since 5
074     */
075    @Beta
076    @GwtCompatible(emulated = true)
077    public class InternetDomainName {
078    
079      private static final CharMatcher DOTS_MATCHER =
080          CharMatcher.anyOf(".\u3002\uFF0E\uFF61");
081      private static final Splitter DOT_SPLITTER = Splitter.on('.');
082      private static final Joiner DOT_JOINER = Joiner.on('.');
083    
084      /**
085       * Value of {@link #publicSuffixIndex} which indicates that no public suffix
086       * was found.
087       */
088      private static final int NO_PUBLIC_SUFFIX_FOUND = -1;
089    
090      private static final String DOT_REGEX = "\\.";
091    
092      /**
093       * Maximum parts (labels) in a domain name. This value arises from
094       * the 255-octet limit described in
095       * <a href="http://www.ietf.org/rfc/rfc2181.txt">RFC 2181</a> part 11 with
096       * the fact that the encoding of each part occupies at least two bytes
097       * (dot plus label externally, length byte plus label internally). Thus, if
098       * all labels have the minimum size of one byte, 127 of them will fit.
099       */
100      private static final int MAX_PARTS = 127;
101    
102      /**
103       * Maximum length of a full domain name, including separators, and
104       * leaving room for the root label. See
105       * <a href="http://www.ietf.org/rfc/rfc2181.txt">RFC 2181</a> part 11.
106       */
107      private static final int MAX_LENGTH = 253;
108    
109      /**
110       * Maximum size of a single part of a domain name. See
111       * <a href="http://www.ietf.org/rfc/rfc2181.txt">RFC 2181</a> part 11.
112       */
113      private static final int MAX_DOMAIN_PART_LENGTH = 63;
114    
115      /**
116       * The full domain name, converted to lower case.
117       */
118      private final String name;
119    
120      /**
121       * The parts of the domain name, converted to lower case.
122       */
123      private final ImmutableList<String> parts;
124    
125      /**
126       * The index in the {@link #parts()} list at which the public suffix begins.
127       * For example, for the domain name {@code www.google.co.uk}, the value would
128       * be 2 (the index of the {@code co} part). The value is negative
129       * (specifically, {@link #NO_PUBLIC_SUFFIX_FOUND}) if no public suffix was
130       * found.
131       */
132      private final int publicSuffixIndex;
133    
134      /**
135       * Private constructor used to implement {@link #fromLenient(String)}.
136       */
137      private InternetDomainName(String name) {
138        // Normalize:
139        // * ASCII characters to lowercase
140        // * All dot-like characters to '.'
141        // * Strip trailing '.'
142    
143        name = Ascii.toLowerCase(DOTS_MATCHER.replaceFrom(name, '.'));
144    
145        if (name.endsWith(".")) {
146          name = name.substring(0, name.length() - 1);
147        }
148    
149        checkArgument(name.length() <= MAX_LENGTH, "Domain name too long: '%s':", name);
150        this.name = name;
151    
152        this.parts = ImmutableList.copyOf(DOT_SPLITTER.split(name));
153        checkArgument(parts.size() <= MAX_PARTS, "Domain has too many parts: '%s'", name);
154        checkArgument(validateSyntax(parts), "Not a valid domain name: '%s'", name);
155    
156        this.publicSuffixIndex = findPublicSuffix();
157      }
158    
159      /**
160       * Returns the index of the leftmost part of the public suffix, or -1 if not
161       * found. Note that the value defined as the "public suffix" may not be a
162       * public suffix according to {@link #isPublicSuffix()} if the domain ends
163       * with an excluded domain pattern such as {@code "nhs.uk"}.
164       */
165      private int findPublicSuffix() {
166        final int partsSize = parts.size();
167    
168        for (int i = 0; i < partsSize; i++) {
169          String ancestorName = DOT_JOINER.join(parts.subList(i, partsSize));
170    
171          if (TldPatterns.EXACT.contains(ancestorName)) {
172            return i;
173          }
174    
175          // Excluded domains (e.g. !nhs.uk) use the next highest
176          // domain as the effective public suffix (e.g. uk).
177    
178          if (TldPatterns.EXCLUDED.contains(ancestorName)) {
179            return i + 1;
180          }
181    
182          if (matchesWildcardPublicSuffix(ancestorName)) {
183            return i;
184          }
185        }
186    
187        return NO_PUBLIC_SUFFIX_FOUND;
188      }
189    
190      /**
191       * Returns an instance of {@link InternetDomainName} after lenient
192       * validation.  Specifically, validation against <a
193       * href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>
194       * ("Internationalizing Domain Names in Applications") is skipped, while
195       * validation against <a
196       * href="http://www.ietf.org/rfc/rfc1035.txt">RFC 1035</a> is relaxed in
197       * the following ways:
198       * <ul>
199       * <li>Any part containing non-ASCII characters is considered valid.
200       * <li>Underscores ('_') are permitted wherever dashes ('-') are permitted.
201       * <li>Parts other than the final part may start with a digit.
202       * </ul>
203       *
204       * @param domain A domain name (not IP address)
205       * @throws IllegalArgumentException if {@code name} is not syntactically valid
206       *     according to {@link #isValidLenient}
207       * @since 8 (previously named {@code from})
208       */
209      public static InternetDomainName fromLenient(String domain) {
210        return new InternetDomainName(checkNotNull(domain));
211      }
212    
213      /**
214       * Validation method used by {@from} to ensure that the domain name is
215       * syntactically valid according to RFC 1035.
216       *
217       * @return Is the domain name syntactically valid?
218       */
219      private static boolean validateSyntax(List<String> parts) {
220        final int lastIndex = parts.size() - 1;
221    
222        // Validate the last part specially, as it has different syntax rules.
223    
224        if (!validatePart(parts.get(lastIndex), true)) {
225          return false;
226        }
227    
228        for (int i = 0; i < lastIndex; i++) {
229          String part = parts.get(i);
230          if (!validatePart(part, false)) {
231            return false;
232          }
233        }
234    
235        return true;
236      }
237    
238      private static final CharMatcher DASH_MATCHER = CharMatcher.anyOf("-_");
239    
240      private static final CharMatcher PART_CHAR_MATCHER =
241          CharMatcher.JAVA_LETTER_OR_DIGIT.or(DASH_MATCHER);
242    
243      /**
244       * Helper method for {@link #validateSyntax(List)}. Validates that one part of
245       * a domain name is valid.
246       *
247       * @param part The domain name part to be validated
248       * @param isFinalPart Is this the final (rightmost) domain part?
249       * @return Whether the part is valid
250       */
251      private static boolean validatePart(String part, boolean isFinalPart) {
252    
253        // These tests could be collapsed into one big boolean expression, but
254        // they have been left as independent tests for clarity.
255    
256        if (part.length() < 1 || part.length() > MAX_DOMAIN_PART_LENGTH) {
257          return false;
258        }
259    
260        /*
261         * GWT claims to support java.lang.Character's char-classification methods,
262         * but it actually only works for ASCII. So for now, assume any non-ASCII
263         * characters are valid. The only place this seems to be documented is here:
264         * http://osdir.com/ml/GoogleWebToolkitContributors/2010-03/msg00178.html
265         *
266         * <p>ASCII characters in the part are expected to be valid per RFC 1035,
267         * with underscore also being allowed due to widespread practice.
268         */
269    
270        String asciiChars = CharMatcher.ASCII.retainFrom(part);
271    
272        if (!PART_CHAR_MATCHER.matchesAllOf(asciiChars)) {
273          return false;
274        }
275    
276        // No initial or final dashes or underscores.
277    
278        if (DASH_MATCHER.matches(part.charAt(0))
279            || DASH_MATCHER.matches(part.charAt(part.length() - 1))) {
280          return false;
281        }
282    
283        /*
284         * Note that we allow (in contravention of a strict interpretation of the
285         * relevant RFCs) domain parts other than the last may begin with a digit
286         * (for example, "3com.com"). It's important to disallow an initial digit in
287         * the last part; it's the only thing that stops an IPv4 numeric address
288         * like 127.0.0.1 from looking like a valid domain name.
289         */
290    
291        if (isFinalPart && CharMatcher.DIGIT.matches(part.charAt(0))) {
292          return false;
293        }
294    
295        return true;
296      }
297    
298      /**
299       * Returns the domain name, normalized to all lower case.
300       */
301      public String name() {
302        return name;
303      }
304    
305      /**
306       * Returns the individual components of this domain name, normalized to all
307       * lower case. For example, for the domain name {@code mail.google.com}, this
308       * method returns the list {@code ["mail", "google", "com"]}.
309       */
310      public ImmutableList<String> parts() {
311        return parts;
312      }
313    
314      /**
315       * Indicates whether this domain name represents a <i>public suffix</i>, as
316       * defined by the Mozilla Foundation's
317       * <a href="http://publicsuffix.org/">Public Suffix List</a> (PSL). A public
318       * suffix is one under which Internet users can directly register names, such
319       * as {@code com}, {@code co.uk} or {@code pvt.k12.wy.us}. Examples of domain
320       * names that are <i>not</i> public suffixes include {@code google}, {@code
321       * google.com} and {@code foo.co.uk}.
322       *
323       * @return {@code true} if this domain name appears exactly on the public
324       *     suffix list
325       * @since 6
326       */
327      public boolean isPublicSuffix() {
328        return publicSuffixIndex == 0;
329      }
330    
331      /**
332       * Indicates whether this domain name ends in a {@linkplain #isPublicSuffix()
333       * public suffix}, including if it is a public suffix itself. For example,
334       * returns {@code true} for {@code www.google.com}, {@code foo.co.uk} and
335       * {@code com}, but not for {@code google} or {@code google.foo}. This is
336       * the recommended method for determining whether a domain is potentially an
337       * addressable host.
338       *
339       * @since 6
340       */
341      public boolean hasPublicSuffix() {
342        return publicSuffixIndex != NO_PUBLIC_SUFFIX_FOUND;
343      }
344    
345      /**
346       * Returns the {@linkplain #isPublicSuffix() public suffix} portion of the
347       * domain name, or {@code null} if no public suffix is present.
348       *
349       * @since 6
350       */
351      public InternetDomainName publicSuffix() {
352        return hasPublicSuffix() ? ancestor(publicSuffixIndex) : null;
353      }
354    
355      /**
356       * Indicates whether this domain name ends in a {@linkplain #isPublicSuffix()
357       * public suffix}, while not being a public suffix itself. For example,
358       * returns {@code true} for {@code www.google.com}, {@code foo.co.uk} and
359       * {@code bar.ca.us}, but not for {@code google}, {@code com}, or {@code
360       * google.foo}.
361       *
362       * <p><b>Warning:</b> a {@code false} result from this method does not imply
363       * that the domain does not represent an addressable host, as many public
364       * suffixes are also addressable hosts. Use {@link #hasPublicSuffix()} for
365       * that test.
366       *
367       * <p>This method can be used to determine whether it will probably be
368       * possible to set cookies on the domain, though even that depends on
369       * individual browsers' implementations of cookie controls. See
370       * <a href="http://www.ietf.org/rfc/rfc2109.txt">RFC 2109</a> for details.
371       *
372       * @since 6
373       */
374      public boolean isUnderPublicSuffix() {
375        return publicSuffixIndex > 0;
376      }
377    
378      /**
379       * Indicates whether this domain name is composed of exactly one subdomain
380       * component followed by a {@linkplain #isPublicSuffix() public suffix}. For
381       * example, returns {@code true} for {@code google.com} and {@code foo.co.uk},
382       * but not for {@code www.google.com} or {@code co.uk}.
383       *
384       * <p><b>Warning:</b> A {@code true} result from this method does not imply
385       * that the domain is at the highest level which is addressable as a host, as
386       * many public suffixes are also addressable hosts. For example, the domain
387       * {@code bar.uk.com} has a public suffix of {@code uk.com}, so it would
388       * return {@code true} from this method. But {@code uk.com} is itself an
389       * addressable host.
390       *
391       * <p>This method can be used to determine whether a domain is probably the
392       * highest level for which cookies may be set, though even that depends on
393       * individual browsers' implementations of cookie controls. See
394       * <a href="http://www.ietf.org/rfc/rfc2109.txt">RFC 2109</a> for details.
395       *
396       * @since 6
397       */
398      public boolean isTopPrivateDomain() {
399        return publicSuffixIndex == 1;
400      }
401    
402      /**
403       * Returns the portion of this domain name that is one level beneath the
404       * public suffix. For example, for {@code x.adwords.google.co.uk} it returns
405       * {@code google.co.uk}, since {@code co.uk} is a public suffix.
406       *
407       * <p>If {@link #isTopPrivateDomain()} is true, the current domain name
408       * instance is returned.
409       *
410       * <p>This method should not be used to determine the topmost parent domain
411       * which is addressable as a host, as many public suffixes are also
412       * addressable hosts. For example, the domain {@code foo.bar.uk.com} has
413       * a public suffix of {@code uk.com}, so it would return {@code bar.uk.com}
414       * from this method. But {@code uk.com} is itself an addressable host.
415       *
416       * <p>This method can be used to determine the probable highest level parent
417       * domain for which cookies may be set, though even that depends on individual
418       * browsers' implementations of cookie controls.
419       *
420       * @throws IllegalStateException if this domain does not end with a
421       *     public suffix
422       * @since 6
423       */
424      public InternetDomainName topPrivateDomain() {
425        if (isTopPrivateDomain()) {
426          return this;
427        }
428        checkState(isUnderPublicSuffix(), "Not under a public suffix: %s", name);
429        return ancestor(publicSuffixIndex - 1);
430      }
431    
432      /**
433       * Indicates whether this domain is composed of two or more parts.
434       */
435      public boolean hasParent() {
436        return parts.size() > 1;
437      }
438    
439      /**
440       * Returns an {@code InternetDomainName} that is the immediate ancestor of
441       * this one; that is, the current domain with the leftmost part removed. For
442       * example, the parent of {@code www.google.com} is {@code google.com}.
443       *
444       * @throws IllegalStateException if the domain has no parent, as determined
445       *     by {@link #hasParent}
446       */
447      public InternetDomainName parent() {
448        checkState(hasParent(), "Domain '%s' has no parent", name);
449        return ancestor(1);
450      }
451    
452      /**
453       * Returns the ancestor of the current domain at the given number of levels
454       * "higher" (rightward) in the subdomain list. The number of levels must be
455       * non-negative, and less than {@code N-1}, where {@code N} is the number of
456       * parts in the domain.
457       *
458       * <p>TODO: Reasonable candidate for addition to public API.
459       */
460      private InternetDomainName ancestor(int levels) {
461        return fromInternal(DOT_JOINER.join(parts.subList(levels, parts.size())));
462      }
463    
464      /**
465       * Creates and returns a new {@code InternetDomainName} by prepending the
466       * argument and a dot to the current name. For example, {@code
467       * InternetDomainName.fromLenient("foo.com").child("www.bar")} returns a new
468       * {@code InternetDomainName} with the value {@code www.bar.foo.com}.
469       *
470       * @throws NullPointerException if leftParts is null
471       * @throws IllegalArgumentException if the resulting name is not valid
472       */
473      public InternetDomainName child(String leftParts) {
474        return fromInternal(checkNotNull(leftParts) + "." + name);
475      }
476    
477      /**
478       * Returns a new {@link InternetDomainName} instance with the given {@code
479       * name}, using the same validation as the instance on which it is called.
480       */
481      InternetDomainName fromInternal(String name) {
482        return fromLenient(name);
483      }
484    
485      /**
486       * Indicates whether the argument is a syntactically valid domain name after
487       * lenient validation. Specifically, validation against <a
488       * href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>
489       * ("Internationalizing Domain Names in Applications") is skipped.
490       *
491       * <p>The follow two code snippets are equivalent:
492       *
493       * <pre>   {@code
494       *
495       *   if (InternetDomainName.isValidLenient(name)) {
496       *     domainName = InternetDomainName.fromLenient(name);
497       *   } else {
498       *     domainName = DEFAULT_DOMAIN;
499       *   }}</pre>
500       *
501       * <pre>   {@code
502       *
503       *   try {
504       *     domainName = InternetDomainName.fromLenient(name);
505       *   } catch (IllegalArgumentException e) {
506       *     domainName = DEFAULT_DOMAIN;
507       *   }}</pre>
508       *
509       * @since 8 (previously named {@code isValid})
510       */
511      public static boolean isValidLenient(String name) {
512        try {
513          fromLenient(name);
514          return true;
515        } catch (IllegalArgumentException e) {
516          return false;
517        }
518      }
519    
520      /**
521       * Does the domain name match one of the "wildcard" patterns (e.g.
522       * {@code "*.ar"})?
523       */
524      private static boolean matchesWildcardPublicSuffix(String domain) {
525        final String[] pieces = domain.split(DOT_REGEX, 2);
526        return pieces.length == 2 && TldPatterns.UNDER.contains(pieces[1]);
527      }
528    
529      // TODO: specify this to return the same as name(); remove name()
530      @Override
531      public String toString() {
532        return Objects.toStringHelper(this).add("name", name).toString();
533      }
534    
535      @Override
536      public boolean equals(@Nullable Object object) {
537        if (object == this) {
538          return true;
539        }
540    
541        if (object instanceof InternetDomainName) {
542          InternetDomainName that = (InternetDomainName) object;
543          return this.name.equals(that.name);
544        }
545    
546        return false;
547      }
548    
549      @Override
550      public int hashCode() {
551        return name.hashCode();
552      }
553    }