001    /*
002     * Copyright (C) 2009 Google Inc.
003     *
004     * Licensed under the Apache License, Version 2.0 (the "License");
005     * you may not use this file except in compliance with the License.
006     * You may obtain a copy of the License at
007     *
008     * http://www.apache.org/licenses/LICENSE-2.0
009     *
010     * Unless required by applicable law or agreed to in writing, software
011     * distributed under the License is distributed on an "AS IS" BASIS,
012     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013     * See the License for the specific language governing permissions and
014     * limitations under the License.
015     */
016    
017    package com.google.common.net;
018    
import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkNotNull;
import static com.google.common.base.Preconditions.checkState;

import com.google.common.annotations.Beta;
import com.google.common.annotations.GwtCompatible;
import com.google.common.base.CharMatcher;
import com.google.common.base.Joiner;
import com.google.common.base.Objects;
import com.google.common.base.Splitter;
import com.google.common.collect.ImmutableList;

import java.util.List;
import java.util.Locale;

import javax.annotation.Nullable;
034    
035    /**
036     * An immutable well-formed internet domain name, as defined by
037     * <a href="http://www.ietf.org/rfc/rfc1035.txt">RFC 1035</a>, with the
038     * exception that names ending in {@code "."} are not supported (as they are not
039     * generally used in browsers, email, and other end-user applications. Examples
040     * include {@code com} and {@code foo.co.uk}. Only syntactic analysis is
041     * performed; no DNS lookups or other network interactions take place. Thus
042     * there is no guarantee that the domain actually exists on the internet.
043     * Invalid domain names throw {@link IllegalArgumentException} on construction.
044     *
045     * <p>It is often the case that domains of interest are those under a
046     * {@linkplain #isPublicSuffix() public suffix} but not themselves a public
047     * suffix; {@link #hasPublicSuffix()} and {@link #isTopPrivateDomain()} test for
048     * this. Similarly, one often needs to obtain the domain consisting of the
049     * public suffix plus one subdomain level, typically to obtain the highest-level
050     * domain for which cookies may be set. Use {@link #topPrivateDomain()} for this
051     * purpose.
052     *
053     * <p>{@linkplain #equals(Object) Equality} of domain names is case-insensitive,
054     * so for convenience, the {@link #name()} and {@link #parts()} methods return
055     * the lowercase form of the name.
056     *
057     * <p><a href="http://en.wikipedia.org/wiki/Internationalized_domain_name">
058     * internationalized domain names (IDN)</a> such as {@code 网络.cn} are
059     * supported.
060     *
061     * @author Craig Berry
062     * @since 5
063     */
064    @Beta
065    @GwtCompatible
066    public final class InternetDomainName {
067      private static final Splitter DOT_SPLITTER = Splitter.on('.');
068      private static final Joiner DOT_JOINER = Joiner.on('.');
069    
070      /**
071       * Value of {@link #publicSuffixIndex} which indicates that no public suffix
072       * was found.
073       */
074      private static final int NO_PUBLIC_SUFFIX_FOUND = -1;
075    
076      private static final String DOT_REGEX = "\\.";
077    
078      /**
079       * The full domain name, converted to lower case.
080       */
081      private final String name;
082    
083      /**
084       * The parts of the domain name, converted to lower case.
085       */
086      private final ImmutableList<String> parts;
087    
088      /**
089       * The index in the {@link #parts()} list at which the public suffix begins.
090       * For example, for the domain name {@code www.google.co.uk}, the value would
091       * be 2 (the index of the {@code co} part). The value is negative
092       * (specifically, {@link #NO_PUBLIC_SUFFIX_FOUND}) if no public suffix was
093       * found.
094       */
095      private final int publicSuffixIndex;
096    
097      /**
098       * Private constructor used to implement {@link #from(String)}.
099       */
100      private InternetDomainName(String name) {
101        this.name = name;
102        this.parts = ImmutableList.copyOf(DOT_SPLITTER.split(name));
103        checkArgument(validateSyntax(parts), "Not a valid domain name: '%s'", name);
104        this.publicSuffixIndex = findPublicSuffix();
105      }
106    
107      /**
108       * Private constructor used to implement {@link #ancestor(int)}. Argument
109       * parts are assumed to be valid, as they always come from an existing domain.
110       */
111      private InternetDomainName(List<String> parts) {
112        checkArgument(!parts.isEmpty());
113    
114        this.parts = ImmutableList.copyOf(parts);
115        this.name = DOT_JOINER.join(parts);
116        this.publicSuffixIndex = findPublicSuffix();
117      }
118    
119      /**
120       * Returns the index of the leftmost part of the public suffix, or -1 if not
121       * found.
122       */
123      private int findPublicSuffix() {
124        final int partsSize = parts.size();
125    
126        for (int i = 0; i < partsSize; i++) {
127          String ancestorName = DOT_JOINER.join(parts.subList(i, partsSize));
128    
129          if (isPublicSuffixInternal(ancestorName)) {
130            return i;
131          }
132        }
133    
134        return NO_PUBLIC_SUFFIX_FOUND;
135      }
136    
137      /**
138       * A factory method for creating {@code InternetDomainName} objects.
139       *
140       * @param domain A domain name (not IP address)
141       * @throws IllegalArgumentException If name is not syntactically valid
142       */
143      public static InternetDomainName from(String domain) {
144        // RFC 1035 defines domain names to be case-insensitive; normalizing
145        // to lower case allows us to simplify matching.
146        return new InternetDomainName(domain.toLowerCase());
147      }
148    
149      // TODO: For the moment, we validate that all parts of a domain
150      // * Start and end with an alphanumeric character
151      // * Have alphanumeric, dash, or underscore characters internally
152      // An additional constraint is that the first character of the last part
153      // may not be numeric.
154      // All of this is a compromise to allow relatively accurate and efficient
155      // checking. We may soon move to using java.net.IDN for this purpose in
156      // non-GWT code.
157    
158      /**
159       * Validation method used by {@from} to ensure that the domain name is
160       * syntactically valid according to RFC 1035.
161       *
162       * @return Is the domain name syntactically valid?
163       */
164      private static boolean validateSyntax(List<String> parts) {
165        final int lastIndex = parts.size() - 1;
166    
167        // Validate the last part specially, as it has different syntax rules.
168    
169        if (!validatePart(parts.get(lastIndex), true)) {
170          return false;
171        }
172    
173        for (int i = 0; i < lastIndex; i++) {
174          String part = parts.get(i);
175          if (!validatePart(part, false)) {
176            return false;
177          }
178        }
179    
180        return true;
181      }
182    
183      /**
184       * The maximum size of a single part of a domain name.
185       */
186      private static final int MAX_DOMAIN_PART_LENGTH = 63;
187    
188      private static final CharMatcher DASH_MATCHER = CharMatcher.anyOf("-_");
189    
190      private static final CharMatcher PART_CHAR_MATCHER =
191          CharMatcher.JAVA_LETTER_OR_DIGIT.or(DASH_MATCHER);
192    
193      /**
194       * Helper method for {@link #validateSyntax(List)}. Validates that one part of
195       * a domain name is valid.
196       *
197       * @param part The domain name part to be validated
198       * @param isFinalPart Is this the final (rightmost) domain part?
199       * @return Whether the part is valid
200       */
201      private static boolean validatePart(String part, boolean isFinalPart) {
202    
203        // These tests could be collapsed into one big boolean expression, but
204        // they have been left as independent tests for clarity.
205    
206        if (part.length() < 1 || part.length() > MAX_DOMAIN_PART_LENGTH) {
207          return false;
208        }
209    
210        // GWT claims to support java.lang.Character's char-classification
211        // methods, but it actually only works for ASCII. So for now,
212        // assume anything with non-ASCII characters is valid.
213        // The only place this seems to be documented is here:
214        // http://osdir.com/ml/GoogleWebToolkitContributors/2010-03/msg00178.html
215    
216        if (!CharMatcher.ASCII.matchesAllOf(part)) {
217          return true;
218        }
219    
220        if (!PART_CHAR_MATCHER.matchesAllOf(part)) {
221          return false;
222        }
223    
224        if (DASH_MATCHER.matches(part.charAt(0))
225            || DASH_MATCHER.matches(part.charAt(part.length() - 1))) {
226          return false;
227        }
228    
229        if (isFinalPart && CharMatcher.DIGIT.matches(part.charAt(0))) {
230          return false;
231        }
232    
233        return true;
234      }
235    
236      /**
237       * Returns the domain name, normalized to all lower case.
238       */
239      public String name() {
240        return name;
241      }
242    
243      /**
244       * Returns the individual components of this domain name, normalized to all
245       * lower case. For example, for the domain name {@code mail.google.com}, this
246       * method returns the list {@code ["mail", "google", "com"]}.
247       */
248      public ImmutableList<String> parts() {
249        return parts;
250      }
251    
252      /**
253       * Old location of {@link #isPublicSuffix()}.
254       *
255       * @deprecated use {@link #isPublicSuffix()}
256       */
257      @Deprecated public boolean isRecognizedTld() {
258        return isPublicSuffix();
259      }
260    
261      /**
262       * Old location of {@link #isUnderPublicSuffix()}.
263       *
264       * @deprecated use {@link #isUnderPublicSuffix()}
265       */
266      @Deprecated public boolean isUnderRecognizedTld() {
267        return isUnderPublicSuffix();
268      }
269    
270      /**
271       * Old location of {@link #hasPublicSuffix()}.
272       *
273       * @deprecated use {@link #hasPublicSuffix()}
274       */
275      @Deprecated public boolean hasRecognizedTld() {
276        return hasPublicSuffix();
277      }
278    
279      /**
280       * Old location of {@link #publicSuffix()}.
281       *
282       * @deprecated use {@link #publicSuffix()}
283       */
284      @Deprecated public InternetDomainName recognizedTld() {
285        return publicSuffix();
286      }
287    
288      /**
289       * Old location of {@link #isTopPrivateDomain()}.
290       *
291       * @deprecated use {@link #isTopPrivateDomain()}
292       */
293      @Deprecated public boolean isImmediatelyUnderTld() {
294        return isTopPrivateDomain();
295      }
296    
297      /**
298       * Old location of {@link #topPrivateDomain()}.
299       *
300       * @deprecated use {@link #topPrivateDomain()}
301       */
302      @Deprecated public InternetDomainName topCookieDomain() {
303        return topPrivateDomain();
304      }
305    
306      /**
307       * Returns the rightmost non-{@linkplain #isRecognizedTld() TLD} domain name
308       * part.  For example
309       * {@code new InternetDomainName("www.google.com").rightmostNonTldPart()}
310       * returns {@code "google"}.  Returns null if either no
311       * {@linkplain #isRecognizedTld() TLD} is found, or the whole domain name is
312       * itself a {@linkplain #isRecognizedTld() TLD}.
313       *
314       * @deprecated use the first {@linkplain #parts part} of the {@link
315       *     #topPrivateDomain()}
316       */
317      @Deprecated public String rightmostNonTldPart() {
318        return publicSuffixIndex >= 1
319            ? parts.get(publicSuffixIndex - 1)
320            : null;
321      }
322    
323      /**
324       * Indicates whether this domain name represents a <i>public suffix</i>, as
325       * defined by the Mozilla Foundation's
326       * <a href="http://publicsuffix.org/">Public Suffix List</a> (PSL). A public
327       * suffix is one under which Internet users can directly register names, such
328       * as {@code com}, {@code co.uk} or {@code pvt.k12.wy.us}. Examples of domain
329       * names that are <i>not</i> public suffixes include {@code google}, {@code
330       * google.com} and {@code foo.co.uk}.
331       *
332       * @return {@code true} if this domain name appears exactly on the public
333       *     suffix list
334       * @since 6
335       */
336      public boolean isPublicSuffix() {
337        return publicSuffixIndex == 0;
338      }
339    
340      /**
341       * Indicates whether this domain name ends in a {@linkplain #isPublicSuffix()
342       * public suffix}, including if it is a public suffix itself. For example,
343       * returns {@code true} for {@code www.google.com}, {@code foo.co.uk} and
344       * {@code com}, but not for {@code google} or {@code google.foo}.
345       *
346       * @since 6
347       */
348      public boolean hasPublicSuffix() {
349        return publicSuffixIndex != NO_PUBLIC_SUFFIX_FOUND;
350      }
351    
352      /**
353       * Returns the {@linkplain #isPublicSuffix() public suffix} portion of the
354       * domain name, or {@code null} if no public suffix is present.
355       *
356       * @since 6
357       */
358      public InternetDomainName publicSuffix() {
359        return hasPublicSuffix() ? ancestor(publicSuffixIndex) : null;
360      }
361    
362      /**
363       * Indicates whether this domain name ends in a {@linkplain #isPublicSuffix()
364       * public suffix}, while not being a public suffix itself. For example,
365       * returns {@code true} for {@code www.google.com}, {@code foo.co.uk} and
366       * {@code bar.ca.us}, but not for {@code google}, {@code com}, or {@code
367       * google.foo}.
368       *
369       * @since 6
370       */
371      public boolean isUnderPublicSuffix() {
372        return publicSuffixIndex > 0;
373      }
374    
375      /**
376       * Indicates whether this domain name is composed of exactly one subdomain
377       * component followed by a {@linkplain #isPublicSuffix() public suffix}. For
378       * example, returns {@code true} for {@code google.com} and {@code foo.co.uk},
379       * but not for {@code www.google.com} or {@code co.uk}.
380       *
381       * @since 6
382       */
383      public boolean isTopPrivateDomain() {
384        return publicSuffixIndex == 1;
385      }
386    
387      /**
388       * Returns the portion of this domain name that is one level beneath the
389       * public suffix. For example, for {@code x.adwords.google.co.uk} it returns
390       * {@code google.co.uk}, since {@code co.uk} is a public suffix. This is the
391       * highest-level parent of this domain for which cookies may be set, as
392       * cookies cannot be set on a public suffix itself.
393       *
394       * <p>If {@link #isTopPrivateDomain()} is true, the current domain name
395       * instance is returned.
396       *
397       * @throws IllegalStateException if this domain does not end with a
398       *     public suffix
399       * @since 6
400       */
401      public InternetDomainName topPrivateDomain() {
402        if (isTopPrivateDomain()) {
403          return this;
404        }
405        checkState(isUnderPublicSuffix(), "Not under a public suffix: %s", name);
406        return ancestor(publicSuffixIndex - 1);
407      }
408    
409      /**
410       * Indicates whether this domain is composed of two or more parts.
411       */
412      public boolean hasParent() {
413        return parts.size() > 1;
414      }
415    
416      /**
417       * Returns an {@code InternetDomainName} that is the immediate ancestor of
418       * this one; that is, the current domain with the leftmost part removed. For
419       * example, the parent of {@code www.google.com} is {@code google.com}.
420       *
421       * @throws IllegalStateException if the domain has no parent, as determined
422       *     by {@link #hasParent}
423       */
424      public InternetDomainName parent() {
425        checkState(hasParent(), "Domain '%s' has no parent", name);
426        return ancestor(1);
427      }
428    
429      /**
430       * Returns the ancestor of the current domain at the given number of levels
431       * "higher" (rightward) in the subdomain list. The number of levels must be
432       * non-negative, and less than {@code N-1}, where {@code N} is the number of
433       * parts in the domain.
434       *
435       * <p>TODO: Reasonable candidate for addition to public API.
436       */
437      private InternetDomainName ancestor(int levels) {
438        return new InternetDomainName(parts.subList(levels, parts.size()));
439      }
440    
441      /**
442       * Creates and returns a new {@code InternetDomainName} by prepending the
443       * argument and a dot to the current name. For example, {@code
444       * InternetDomainName.from("foo.com").child("www.bar")} returns a new {@code
445       * InternetDomainName} with the value {@code www.bar.foo.com}.
446       *
447       * @throws NullPointerException if leftParts is null
448       * @throws IllegalArgumentException if the resulting name is not valid
449       */
450      public InternetDomainName child(String leftParts) {
451        return InternetDomainName.from(checkNotNull(leftParts) + "." + name);
452      }
453    
454      /**
455       * Indicates whether the argument is a syntactically valid domain name.  This
456       * method is intended for the case where a {@link String} must be validated as
457       * a valid domain name, but no further work with that {@link String} as an
458       * {@link InternetDomainName} will be required. Code like the following will
459       * unnecessarily repeat the work of validation: <pre>   {@code
460       *
461       *   if (InternetDomainName.isValid(name)) {
462       *     domainName = InternetDomainName.from(name);
463       *   } else {
464       *     domainName = DEFAULT_DOMAIN;
465       *   }}</pre>
466       *
467       * Such code could instead be written as follows: <pre>   {@code
468       *
469       *   try {
470       *     domainName = InternetDomainName.from(name);
471       *   } catch (IllegalArgumentException e) {
472       *     domainName = DEFAULT_DOMAIN;
473       *   }}</pre>
474       */
475      public static boolean isValid(String name) {
476        try {
477          from(name);
478          return true;
479        } catch (IllegalArgumentException e) {
480          return false;
481        }
482      }
483    
484      /**
485       * Does the domain name satisfy the Mozilla criteria for a {@linkplain
486       * #isPublicSuffix() public suffix}?
487       */
488      private static boolean isPublicSuffixInternal(String domain) {
489        return TldPatterns.EXACT.contains(domain)
490            || (!TldPatterns.EXCLUDED.contains(domain)
491                && matchesWildcardPublicSuffix(domain));
492      }
493    
494      /**
495       * Does the domain name match one of the "wildcard" patterns (e.g. "*.ar")?
496       */
497      private static boolean matchesWildcardPublicSuffix(String domain) {
498        final String[] pieces = domain.split(DOT_REGEX, 2);
499        return pieces.length == 2 && TldPatterns.UNDER.contains(pieces[1]);
500      }
501    
502      // TODO: specify this to return the same as name(); remove name()
503      @Override
504      public String toString() {
505        return Objects.toStringHelper(this).add("name", name).toString();
506      }
507    
508      @Override
509      public boolean equals(@Nullable Object object) {
510        if (object == this) {
511          return true;
512        }
513    
514        if (object instanceof InternetDomainName) {
515          InternetDomainName that = (InternetDomainName) object;
516          return this.name.equals(that.name);
517        }
518    
519        return false;
520      }
521    
522      @Override
523      public int hashCode() {
524        return name.hashCode();
525      }
526    }