001    /*
002     * Copyright (C) 2009 Google Inc.
003     *
004     * Licensed under the Apache License, Version 2.0 (the "License");
005     * you may not use this file except in compliance with the License.
006     * You may obtain a copy of the License at
007     *
008     * http://www.apache.org/licenses/LICENSE-2.0
009     *
010     * Unless required by applicable law or agreed to in writing, software
011     * distributed under the License is distributed on an "AS IS" BASIS,
012     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013     * See the License for the specific language governing permissions and
014     * limitations under the License.
015     */
016    
017    package com.google.common.net;
018    
019    import static com.google.common.base.Preconditions.checkArgument;
020    import static com.google.common.base.Preconditions.checkNotNull;
021    import static com.google.common.base.Preconditions.checkState;
022    
023    import com.google.common.annotations.Beta;
024    import com.google.common.annotations.GwtCompatible;
025    import com.google.common.base.Ascii;
026    import com.google.common.base.CharMatcher;
027    import com.google.common.base.Joiner;
028    import com.google.common.base.Objects;
029    import com.google.common.base.Splitter;
030    import com.google.common.collect.ImmutableList;
031    
032    import java.util.List;
033    
034    import javax.annotation.Nullable;
035    
036    /**
037     * An immutable well-formed internet domain name, as defined by
038     * <a href="http://www.ietf.org/rfc/rfc1035.txt">RFC 1035</a>.
039     * Examples include {@code com} and {@code foo.co.uk}. Only syntactic analysis
040     * is performed; no DNS lookups or other network interactions take place. Thus
041     * there is no guarantee that the domain actually exists on the internet.
042     * Invalid domain names throw {@link IllegalArgumentException} on construction.
043     *
044     * <p>One common use of this class is to determine whether a given string is
045     * likely to represent an addressable domain on the web -- that is, for a
046     * candidate string "xxx", might browsing to "http://xxx/" result in a webpage
047     * being displayed? In the past, this test was frequently done by determining
048     * whether the domain ended with a {@linkplain #isPublicSuffix() public suffix}
049     * but was not itself a public suffix. However, this test is no longer accurate;
050     * there are many domains which are both public suffixes and addressable as
051     * hosts. "uk.com" is one example. As a result, the only useful test to
052     * determine if a domain is a plausible web host is {@link #hasPublicSuffix()}.
053     * This will return {@code true} for many domains which (currently) are not
054     * hosts, such as "com"), but given that any public suffix may become
055     * a host without warning, it is better to err on the side of permissiveness
056     * and thus avoid spurious rejection of valid sites.
057     *
058     * <p>{@linkplain #equals(Object) Equality} of domain names is case-insensitive
059     * with respect to ASCII characters, so for convenience, the {@link #name()} and
060     * {@link #parts()} methods return string with all ASCII characters converted to
061     * lowercase.
062     *
063     * <p><a href="http://en.wikipedia.org/wiki/Internationalized_domain_name">
064     * internationalized domain names</a> such as {@code 网络.cn} are
065     * supported, but with much weaker syntactic validation (resulting in false
066     * positive reports of validity).
067     *
068     * @author Craig Berry
069     * @since 5
070     */
071    @Beta
072    @GwtCompatible(emulated = true)
073    public final class InternetDomainName {
074    
075      private static final CharMatcher DOTS_MATCHER =
076          CharMatcher.anyOf(".\u3002\uFF0E\uFF61");
077      private static final Splitter DOT_SPLITTER = Splitter.on('.');
078      private static final Joiner DOT_JOINER = Joiner.on('.');
079    
080      /**
081       * Value of {@link #publicSuffixIndex} which indicates that no public suffix
082       * was found.
083       */
084      private static final int NO_PUBLIC_SUFFIX_FOUND = -1;
085    
086      private static final String DOT_REGEX = "\\.";
087    
088      /**
089       * The full domain name, converted to lower case.
090       */
091      private final String name;
092    
093      /**
094       * The parts of the domain name, converted to lower case.
095       */
096      private final ImmutableList<String> parts;
097    
098      /**
099       * The index in the {@link #parts()} list at which the public suffix begins.
100       * For example, for the domain name {@code www.google.co.uk}, the value would
101       * be 2 (the index of the {@code co} part). The value is negative
102       * (specifically, {@link #NO_PUBLIC_SUFFIX_FOUND}) if no public suffix was
103       * found.
104       */
105      private final int publicSuffixIndex;
106    
107      /**
108       * Private constructor used to implement {@link #fromLenient(String)}.
109       */
110      private InternetDomainName(String name) {
111        // Normalize all dot-like characters to '.', and strip trailing '.'.
112    
113        name = DOTS_MATCHER.replaceFrom(name, '.');
114    
115        if (name.endsWith(".")) {
116          name = name.substring(0, name.length() - 1);
117        }
118    
119        this.name = name;
120        this.parts = ImmutableList.copyOf(DOT_SPLITTER.split(name));
121        checkArgument(validateSyntax(parts), "Not a valid domain name: '%s'", name);
122        this.publicSuffixIndex = findPublicSuffix();
123      }
124    
125      /**
126       * Private constructor used to implement {@link #ancestor(int)}. Argument
127       * parts are assumed to be valid, as they always come from an existing domain.
128       */
129      private InternetDomainName(List<String> parts) {
130        checkArgument(!parts.isEmpty());
131    
132        this.parts = ImmutableList.copyOf(parts);
133        this.name = DOT_JOINER.join(parts);
134        this.publicSuffixIndex = findPublicSuffix();
135      }
136    
137      /**
138       * Returns the index of the leftmost part of the public suffix, or -1 if not
139       * found. Note that the value defined as the "public suffix" may not be a
140       * public suffix according to {@link #isPublicSuffix()} if the domain ends
141       * with an excluded domain pattern such as "nhs.uk".
142       */
143      private int findPublicSuffix() {
144        final int partsSize = parts.size();
145    
146        for (int i = 0; i < partsSize; i++) {
147          String ancestorName = DOT_JOINER.join(parts.subList(i, partsSize));
148    
149          if (TldPatterns.EXACT.contains(ancestorName)) {
150            return i;
151          }
152    
153          // Excluded domains (e.g. !nhs.uk) use the next highest
154          // domain as the effective public suffix (e.g. uk).
155    
156          if (TldPatterns.EXCLUDED.contains(ancestorName)) {
157            return i + 1;
158          }
159    
160          if (matchesWildcardPublicSuffix(ancestorName)) {
161            return i;
162          }
163        }
164    
165        return NO_PUBLIC_SUFFIX_FOUND;
166      }
167    
168      /**
169       * A factory method for creating {@code InternetDomainName} objects. Only
170       * lenient validation of the domain is performed. Specifically,
171       * validation against
172       * <a href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>
173       * ("Internationalizing Domain Names in Applications") is not performed.
174       *
175       * @param domain A domain name (not IP address)
176       * @throws IllegalArgumentException If name is not syntactically valid
177       * @since 8 (previously named {@code from})
178       */
179      public static InternetDomainName fromLenient(String domain) {
180        /*
181         * RFC 1035 defines ASCII components of domain names to be case-insensitive;
182         * normalizing ASCII characters to lower case allows us to simplify matching
183         * and support more robust equality testing.
184         */
185        return new InternetDomainName(Ascii.toLowerCase(checkNotNull(domain)));
186      }
187    
188      /**
189       * Validation method used by {@from} to ensure that the domain name is
190       * syntactically valid according to RFC 1035.
191       *
192       * @return Is the domain name syntactically valid?
193       */
194      private static boolean validateSyntax(List<String> parts) {
195        final int lastIndex = parts.size() - 1;
196    
197        // Validate the last part specially, as it has different syntax rules.
198    
199        if (!validatePart(parts.get(lastIndex), true)) {
200          return false;
201        }
202    
203        for (int i = 0; i < lastIndex; i++) {
204          String part = parts.get(i);
205          if (!validatePart(part, false)) {
206            return false;
207          }
208        }
209    
210        return true;
211      }
212    
213      /**
214       * The maximum size of a single part of a domain name.
215       */
216      private static final int MAX_DOMAIN_PART_LENGTH = 63;
217    
218      private static final CharMatcher DASH_MATCHER = CharMatcher.anyOf("-_");
219    
220      private static final CharMatcher PART_CHAR_MATCHER =
221          CharMatcher.JAVA_LETTER_OR_DIGIT.or(DASH_MATCHER);
222    
223      /**
224       * Helper method for {@link #validateSyntax(List)}. Validates that one part of
225       * a domain name is valid.
226       *
227       * @param part The domain name part to be validated
228       * @param isFinalPart Is this the final (rightmost) domain part?
229       * @return Whether the part is valid
230       */
231      private static boolean validatePart(String part, boolean isFinalPart) {
232    
233        // These tests could be collapsed into one big boolean expression, but
234        // they have been left as independent tests for clarity.
235    
236        if (part.length() < 1 || part.length() > MAX_DOMAIN_PART_LENGTH) {
237          return false;
238        }
239    
240        // GWT claims to support java.lang.Character's char-classification
241        // methods, but it actually only works for ASCII. So for now,
242        // assume anything with non-ASCII characters is valid.
243        // The only place this seems to be documented is here:
244        // http://osdir.com/ml/GoogleWebToolkitContributors/2010-03/msg00178.html
245    
246        if (!CharMatcher.ASCII.matchesAllOf(part)) {
247          return true;
248        }
249    
250        if (!PART_CHAR_MATCHER.matchesAllOf(part)) {
251          return false;
252        }
253    
254        if (DASH_MATCHER.matches(part.charAt(0))
255            || DASH_MATCHER.matches(part.charAt(part.length() - 1))) {
256          return false;
257        }
258    
259        if (isFinalPart && CharMatcher.DIGIT.matches(part.charAt(0))) {
260          return false;
261        }
262    
263        return true;
264      }
265    
266      /**
267       * Returns the domain name, normalized to all lower case.
268       */
269      public String name() {
270        return name;
271      }
272    
273      /**
274       * Returns the individual components of this domain name, normalized to all
275       * lower case. For example, for the domain name {@code mail.google.com}, this
276       * method returns the list {@code ["mail", "google", "com"]}.
277       */
278      public ImmutableList<String> parts() {
279        return parts;
280      }
281    
282      /**
283       * Indicates whether this domain name represents a <i>public suffix</i>, as
284       * defined by the Mozilla Foundation's
285       * <a href="http://publicsuffix.org/">Public Suffix List</a> (PSL). A public
286       * suffix is one under which Internet users can directly register names, such
287       * as {@code com}, {@code co.uk} or {@code pvt.k12.wy.us}. Examples of domain
288       * names that are <i>not</i> public suffixes include {@code google}, {@code
289       * google.com} and {@code foo.co.uk}.
290       *
291       * @return {@code true} if this domain name appears exactly on the public
292       *     suffix list
293       * @since 6
294       */
295      public boolean isPublicSuffix() {
296        return publicSuffixIndex == 0;
297      }
298    
299      /**
300       * Indicates whether this domain name ends in a {@linkplain #isPublicSuffix()
301       * public suffix}, including if it is a public suffix itself. For example,
302       * returns {@code true} for {@code www.google.com}, {@code foo.co.uk} and
303       * {@code com}, but not for {@code google} or {@code google.foo}. This is
304       * the recommended method for determining whether a domain is potentially an
305       * addressable host.
306       *
307       * @since 6
308       */
309      public boolean hasPublicSuffix() {
310        return publicSuffixIndex != NO_PUBLIC_SUFFIX_FOUND;
311      }
312    
313      /**
314       * Returns the {@linkplain #isPublicSuffix() public suffix} portion of the
315       * domain name, or {@code null} if no public suffix is present.
316       *
317       * @since 6
318       */
319      public InternetDomainName publicSuffix() {
320        return hasPublicSuffix() ? ancestor(publicSuffixIndex) : null;
321      }
322    
323      /**
324       * Indicates whether this domain name ends in a {@linkplain #isPublicSuffix()
325       * public suffix}, while not being a public suffix itself. For example,
326       * returns {@code true} for {@code www.google.com}, {@code foo.co.uk} and
327       * {@code bar.ca.us}, but not for {@code google}, {@code com}, or {@code
328       * google.foo}.
329       *
330       * <p><b>Warning:</b> a {@code false} result from this method does not imply
331       * that the domain does not represent an addressable host, as many public
332       * suffixes are also addressable hosts. Use {@link #hasPublicSuffix()} for
333       * that test.
334       *
335       * <p>This method can be used to determine whether it will probably be
336       * possible to set cookies on the domain, though even that depends on
337       * individual browsers' implementations of cookie controls. See
338       * <a href="http://www.ietf.org/rfc/rfc2109.txt">RFC 2109</a> for details.
339       *
340       * @since 6
341       */
342      public boolean isUnderPublicSuffix() {
343        return publicSuffixIndex > 0;
344      }
345    
346      /**
347       * Indicates whether this domain name is composed of exactly one subdomain
348       * component followed by a {@linkplain #isPublicSuffix() public suffix}. For
349       * example, returns {@code true} for {@code google.com} and {@code foo.co.uk},
350       * but not for {@code www.google.com} or {@code co.uk}.
351       *
352       * <p><b>Warning:</b> A {@code true} result from this method does not imply
353       * that the domain is at the highest level which is addressable as a host, as
354       * many public suffixes are also addressable hosts. For example, the domain
355       * {@code bar.uk.com} has a public suffix of {@code uk.com}, so it would
356       * return {@code true} from this method. But {@code uk.com} is itself an
357       * addressable host.
358       *
359       * <p>This method can be used to determine whether a domain is probably the
360       * highest level for which cookies may be set, though even that depends on
361       * individual browsers' implementations of cookie controls. See
362       * <a href="http://www.ietf.org/rfc/rfc2109.txt">RFC 2109</a> for details.
363       *
364       * @since 6
365       */
366      public boolean isTopPrivateDomain() {
367        return publicSuffixIndex == 1;
368      }
369    
370      /**
371       * Returns the portion of this domain name that is one level beneath the
372       * public suffix. For example, for {@code x.adwords.google.co.uk} it returns
373       * {@code google.co.uk}, since {@code co.uk} is a public suffix.
374       *
375       * <p>If {@link #isTopPrivateDomain()} is true, the current domain name
376       * instance is returned.
377       *
378       * <p>This method should not be used to determine the topmost parent domain
379       * which is addressable as a host, as many public suffixes are also
380       * addressable hosts. For example, the domain {@code foo.bar.uk.com} has
381       * a public suffix of {@code uk.com}, so it would return {@code bar.uk.com}
382       * from this method. But {@code uk.com} is itself an addressable host.
383       *
384       * <p>This method can be used to determine the probable highest level parent
385       * domain for which cookies may be set, though even that depends on individual
386       * browsers' implementations of cookie controls.
387       *
388       * @throws IllegalStateException if this domain does not end with a
389       *     public suffix
390       * @since 6
391       */
392      public InternetDomainName topPrivateDomain() {
393        if (isTopPrivateDomain()) {
394          return this;
395        }
396        checkState(isUnderPublicSuffix(), "Not under a public suffix: %s", name);
397        return ancestor(publicSuffixIndex - 1);
398      }
399    
400      /**
401       * Indicates whether this domain is composed of two or more parts.
402       */
403      public boolean hasParent() {
404        return parts.size() > 1;
405      }
406    
407      /**
408       * Returns an {@code InternetDomainName} that is the immediate ancestor of
409       * this one; that is, the current domain with the leftmost part removed. For
410       * example, the parent of {@code www.google.com} is {@code google.com}.
411       *
412       * @throws IllegalStateException if the domain has no parent, as determined
413       *     by {@link #hasParent}
414       */
415      public InternetDomainName parent() {
416        checkState(hasParent(), "Domain '%s' has no parent", name);
417        return ancestor(1);
418      }
419    
420      /**
421       * Returns the ancestor of the current domain at the given number of levels
422       * "higher" (rightward) in the subdomain list. The number of levels must be
423       * non-negative, and less than {@code N-1}, where {@code N} is the number of
424       * parts in the domain.
425       *
426       * <p>TODO: Reasonable candidate for addition to public API.
427       */
428      private InternetDomainName ancestor(int levels) {
429        return new InternetDomainName(parts.subList(levels, parts.size()));
430      }
431    
432      /**
433       * Creates and returns a new {@code InternetDomainName} by prepending the
434       * argument and a dot to the current name. For example, {@code
435       * InternetDomainName.from("foo.com").child("www.bar")} returns a new {@code
436       * InternetDomainName} with the value {@code www.bar.foo.com}.
437       *
438       * @throws NullPointerException if leftParts is null
439       * @throws IllegalArgumentException if the resulting name is not valid
440       */
441      public InternetDomainName child(String leftParts) {
442        return InternetDomainName.fromLenient(checkNotNull(leftParts) + "." + name);
443      }
444    
445      /**
446       * Indicates whether the argument is a syntactically valid domain name. Only
447       * lenient validation is done, as described in {@link #fromLenient(String)}.
448       *
449       * <p>This method is intended for the case where a {@link String} must be
450       * validated as a valid domain name, but no further work with that
451       * {@link String} as an {@link InternetDomainName} will be required. Code like
452       * the following will unnecessarily repeat the work of validation:
453       * <pre>   {@code
454       *
455       *   if (InternetDomainName.isValid(name)) {
456       *     domainName = InternetDomainName.from(name);
457       *   } else {
458       *     domainName = DEFAULT_DOMAIN;
459       *   }}</pre>
460       *
461       * Such code could instead be written as follows: <pre>   {@code
462       *
463       *   try {
464       *     domainName = InternetDomainName.from(name);
465       *   } catch (IllegalArgumentException e) {
466       *     domainName = DEFAULT_DOMAIN;
467       *   }}</pre>
468       *
469       * @since 8 (previously named {@code isValid})
470       */
471      public static boolean isValidLenient(String name) {
472        try {
473          fromLenient(name);
474          return true;
475        } catch (IllegalArgumentException e) {
476          return false;
477        }
478      }
479    
480      /**
481       * Does the domain name match one of the "wildcard" patterns (e.g. "*.ar")?
482       */
483      private static boolean matchesWildcardPublicSuffix(String domain) {
484        final String[] pieces = domain.split(DOT_REGEX, 2);
485        return pieces.length == 2 && TldPatterns.UNDER.contains(pieces[1]);
486      }
487    
488      // TODO: specify this to return the same as name(); remove name()
489      @Override
490      public String toString() {
491        return Objects.toStringHelper(this).add("name", name).toString();
492      }
493    
494      @Override
495      public boolean equals(@Nullable Object object) {
496        if (object == this) {
497          return true;
498        }
499    
500        if (object instanceof InternetDomainName) {
501          InternetDomainName that = (InternetDomainName) object;
502          return this.name.equals(that.name);
503        }
504    
505        return false;
506      }
507    
508      @Override
509      public int hashCode() {
510        return name.hashCode();
511      }
512    
513    }