001 /*
002 * Copyright (C) 2009 Google Inc.
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 * http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 */
016
017 package com.google.common.net;
018
019 import static com.google.common.base.Preconditions.checkArgument;
020 import static com.google.common.base.Preconditions.checkNotNull;
021 import static com.google.common.base.Preconditions.checkState;
022
023 import com.google.common.annotations.Beta;
024 import com.google.common.annotations.GwtCompatible;
025 import com.google.common.base.Ascii;
026 import com.google.common.base.CharMatcher;
027 import com.google.common.base.Joiner;
028 import com.google.common.base.Objects;
029 import com.google.common.base.Splitter;
030 import com.google.common.collect.ImmutableList;
031
032 import java.util.List;
033
034 import javax.annotation.Nullable;
035
036 /**
037 * An immutable well-formed internet domain name, as defined by
038 * <a href="http://www.ietf.org/rfc/rfc1035.txt">RFC 1035</a>.
039 * Examples include {@code com} and {@code foo.co.uk}. Only syntactic analysis
040 * is performed; no DNS lookups or other network interactions take place. Thus
041 * there is no guarantee that the domain actually exists on the internet.
042 * Invalid domain names throw {@link IllegalArgumentException} on construction.
043 *
044 * <p>One common use of this class is to determine whether a given string is
045 * likely to represent an addressable domain on the web -- that is, for a
046 * candidate string "xxx", might browsing to "http://xxx/" result in a webpage
047 * being displayed? In the past, this test was frequently done by determining
048 * whether the domain ended with a {@linkplain #isPublicSuffix() public suffix}
049 * but was not itself a public suffix. However, this test is no longer accurate;
050 * there are many domains which are both public suffixes and addressable as
051 * hosts. "uk.com" is one example. As a result, the only useful test to
052 * determine if a domain is a plausible web host is {@link #hasPublicSuffix()}.
 * This will return {@code true} for many domains which (currently) are not
 * hosts, such as "com", but given that any public suffix may become
055 * a host without warning, it is better to err on the side of permissiveness
056 * and thus avoid spurious rejection of valid sites.
057 *
058 * <p>{@linkplain #equals(Object) Equality} of domain names is case-insensitive
 * with respect to ASCII characters, so for convenience, the {@link #name()} and
 * {@link #parts()} methods return strings with all ASCII characters converted
 * to lowercase.
062 *
 * <p><a href="http://en.wikipedia.org/wiki/Internationalized_domain_name">
 * Internationalized domain names</a> such as {@code 网络.cn} are
065 * supported, but with much weaker syntactic validation (resulting in false
066 * positive reports of validity).
067 *
068 * @author Craig Berry
069 * @since 5
070 */
071 @Beta
072 @GwtCompatible(emulated = true)
073 public final class InternetDomainName {
074
075 private static final CharMatcher DOTS_MATCHER =
076 CharMatcher.anyOf(".\u3002\uFF0E\uFF61");
077 private static final Splitter DOT_SPLITTER = Splitter.on('.');
078 private static final Joiner DOT_JOINER = Joiner.on('.');
079
080 /**
081 * Value of {@link #publicSuffixIndex} which indicates that no public suffix
082 * was found.
083 */
084 private static final int NO_PUBLIC_SUFFIX_FOUND = -1;
085
086 private static final String DOT_REGEX = "\\.";
087
088 /**
089 * The full domain name, converted to lower case.
090 */
091 private final String name;
092
093 /**
094 * The parts of the domain name, converted to lower case.
095 */
096 private final ImmutableList<String> parts;
097
098 /**
099 * The index in the {@link #parts()} list at which the public suffix begins.
100 * For example, for the domain name {@code www.google.co.uk}, the value would
101 * be 2 (the index of the {@code co} part). The value is negative
102 * (specifically, {@link #NO_PUBLIC_SUFFIX_FOUND}) if no public suffix was
103 * found.
104 */
105 private final int publicSuffixIndex;
106
107 /**
108 * Private constructor used to implement {@link #fromLenient(String)}.
109 */
110 private InternetDomainName(String name) {
111 // Normalize all dot-like characters to '.', and strip trailing '.'.
112
113 name = DOTS_MATCHER.replaceFrom(name, '.');
114
115 if (name.endsWith(".")) {
116 name = name.substring(0, name.length() - 1);
117 }
118
119 this.name = name;
120 this.parts = ImmutableList.copyOf(DOT_SPLITTER.split(name));
121 checkArgument(validateSyntax(parts), "Not a valid domain name: '%s'", name);
122 this.publicSuffixIndex = findPublicSuffix();
123 }
124
125 /**
126 * Private constructor used to implement {@link #ancestor(int)}. Argument
127 * parts are assumed to be valid, as they always come from an existing domain.
128 */
129 private InternetDomainName(List<String> parts) {
130 checkArgument(!parts.isEmpty());
131
132 this.parts = ImmutableList.copyOf(parts);
133 this.name = DOT_JOINER.join(parts);
134 this.publicSuffixIndex = findPublicSuffix();
135 }
136
137 /**
138 * Returns the index of the leftmost part of the public suffix, or -1 if not
139 * found. Note that the value defined as the "public suffix" may not be a
140 * public suffix according to {@link #isPublicSuffix()} if the domain ends
141 * with an excluded domain pattern such as "nhs.uk".
142 */
143 private int findPublicSuffix() {
144 final int partsSize = parts.size();
145
146 for (int i = 0; i < partsSize; i++) {
147 String ancestorName = DOT_JOINER.join(parts.subList(i, partsSize));
148
149 if (TldPatterns.EXACT.contains(ancestorName)) {
150 return i;
151 }
152
153 // Excluded domains (e.g. !nhs.uk) use the next highest
154 // domain as the effective public suffix (e.g. uk).
155
156 if (TldPatterns.EXCLUDED.contains(ancestorName)) {
157 return i + 1;
158 }
159
160 if (matchesWildcardPublicSuffix(ancestorName)) {
161 return i;
162 }
163 }
164
165 return NO_PUBLIC_SUFFIX_FOUND;
166 }
167
168 /**
169 * A factory method for creating {@code InternetDomainName} objects. Only
170 * lenient validation of the domain is performed. Specifically,
171 * validation against
172 * <a href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>
173 * ("Internationalizing Domain Names in Applications") is not performed.
174 *
175 * @param domain A domain name (not IP address)
176 * @throws IllegalArgumentException If name is not syntactically valid
177 * @since 8 (previously named {@code from})
178 */
179 public static InternetDomainName fromLenient(String domain) {
180 /*
181 * RFC 1035 defines ASCII components of domain names to be case-insensitive;
182 * normalizing ASCII characters to lower case allows us to simplify matching
183 * and support more robust equality testing.
184 */
185 return new InternetDomainName(Ascii.toLowerCase(checkNotNull(domain)));
186 }
187
188 /**
189 * Validation method used by {@from} to ensure that the domain name is
190 * syntactically valid according to RFC 1035.
191 *
192 * @return Is the domain name syntactically valid?
193 */
194 private static boolean validateSyntax(List<String> parts) {
195 final int lastIndex = parts.size() - 1;
196
197 // Validate the last part specially, as it has different syntax rules.
198
199 if (!validatePart(parts.get(lastIndex), true)) {
200 return false;
201 }
202
203 for (int i = 0; i < lastIndex; i++) {
204 String part = parts.get(i);
205 if (!validatePart(part, false)) {
206 return false;
207 }
208 }
209
210 return true;
211 }
212
213 /**
214 * The maximum size of a single part of a domain name.
215 */
216 private static final int MAX_DOMAIN_PART_LENGTH = 63;
217
218 private static final CharMatcher DASH_MATCHER = CharMatcher.anyOf("-_");
219
220 private static final CharMatcher PART_CHAR_MATCHER =
221 CharMatcher.JAVA_LETTER_OR_DIGIT.or(DASH_MATCHER);
222
223 /**
224 * Helper method for {@link #validateSyntax(List)}. Validates that one part of
225 * a domain name is valid.
226 *
227 * @param part The domain name part to be validated
228 * @param isFinalPart Is this the final (rightmost) domain part?
229 * @return Whether the part is valid
230 */
231 private static boolean validatePart(String part, boolean isFinalPart) {
232
233 // These tests could be collapsed into one big boolean expression, but
234 // they have been left as independent tests for clarity.
235
236 if (part.length() < 1 || part.length() > MAX_DOMAIN_PART_LENGTH) {
237 return false;
238 }
239
240 // GWT claims to support java.lang.Character's char-classification
241 // methods, but it actually only works for ASCII. So for now,
242 // assume anything with non-ASCII characters is valid.
243 // The only place this seems to be documented is here:
244 // http://osdir.com/ml/GoogleWebToolkitContributors/2010-03/msg00178.html
245
246 if (!CharMatcher.ASCII.matchesAllOf(part)) {
247 return true;
248 }
249
250 if (!PART_CHAR_MATCHER.matchesAllOf(part)) {
251 return false;
252 }
253
254 if (DASH_MATCHER.matches(part.charAt(0))
255 || DASH_MATCHER.matches(part.charAt(part.length() - 1))) {
256 return false;
257 }
258
259 if (isFinalPart && CharMatcher.DIGIT.matches(part.charAt(0))) {
260 return false;
261 }
262
263 return true;
264 }
265
266 /**
267 * Returns the domain name, normalized to all lower case.
268 */
269 public String name() {
270 return name;
271 }
272
273 /**
274 * Returns the individual components of this domain name, normalized to all
275 * lower case. For example, for the domain name {@code mail.google.com}, this
276 * method returns the list {@code ["mail", "google", "com"]}.
277 */
278 public ImmutableList<String> parts() {
279 return parts;
280 }
281
282 /**
283 * Indicates whether this domain name represents a <i>public suffix</i>, as
284 * defined by the Mozilla Foundation's
285 * <a href="http://publicsuffix.org/">Public Suffix List</a> (PSL). A public
286 * suffix is one under which Internet users can directly register names, such
287 * as {@code com}, {@code co.uk} or {@code pvt.k12.wy.us}. Examples of domain
288 * names that are <i>not</i> public suffixes include {@code google}, {@code
289 * google.com} and {@code foo.co.uk}.
290 *
291 * @return {@code true} if this domain name appears exactly on the public
292 * suffix list
293 * @since 6
294 */
295 public boolean isPublicSuffix() {
296 return publicSuffixIndex == 0;
297 }
298
299 /**
300 * Indicates whether this domain name ends in a {@linkplain #isPublicSuffix()
301 * public suffix}, including if it is a public suffix itself. For example,
302 * returns {@code true} for {@code www.google.com}, {@code foo.co.uk} and
303 * {@code com}, but not for {@code google} or {@code google.foo}. This is
304 * the recommended method for determining whether a domain is potentially an
305 * addressable host.
306 *
307 * @since 6
308 */
309 public boolean hasPublicSuffix() {
310 return publicSuffixIndex != NO_PUBLIC_SUFFIX_FOUND;
311 }
312
313 /**
314 * Returns the {@linkplain #isPublicSuffix() public suffix} portion of the
315 * domain name, or {@code null} if no public suffix is present.
316 *
317 * @since 6
318 */
319 public InternetDomainName publicSuffix() {
320 return hasPublicSuffix() ? ancestor(publicSuffixIndex) : null;
321 }
322
323 /**
324 * Indicates whether this domain name ends in a {@linkplain #isPublicSuffix()
325 * public suffix}, while not being a public suffix itself. For example,
326 * returns {@code true} for {@code www.google.com}, {@code foo.co.uk} and
327 * {@code bar.ca.us}, but not for {@code google}, {@code com}, or {@code
328 * google.foo}.
329 *
330 * <p><b>Warning:</b> a {@code false} result from this method does not imply
331 * that the domain does not represent an addressable host, as many public
332 * suffixes are also addressable hosts. Use {@link #hasPublicSuffix()} for
333 * that test.
334 *
335 * <p>This method can be used to determine whether it will probably be
336 * possible to set cookies on the domain, though even that depends on
337 * individual browsers' implementations of cookie controls. See
338 * <a href="http://www.ietf.org/rfc/rfc2109.txt">RFC 2109</a> for details.
339 *
340 * @since 6
341 */
342 public boolean isUnderPublicSuffix() {
343 return publicSuffixIndex > 0;
344 }
345
346 /**
347 * Indicates whether this domain name is composed of exactly one subdomain
348 * component followed by a {@linkplain #isPublicSuffix() public suffix}. For
349 * example, returns {@code true} for {@code google.com} and {@code foo.co.uk},
350 * but not for {@code www.google.com} or {@code co.uk}.
351 *
352 * <p><b>Warning:</b> A {@code true} result from this method does not imply
353 * that the domain is at the highest level which is addressable as a host, as
354 * many public suffixes are also addressable hosts. For example, the domain
355 * {@code bar.uk.com} has a public suffix of {@code uk.com}, so it would
356 * return {@code true} from this method. But {@code uk.com} is itself an
357 * addressable host.
358 *
359 * <p>This method can be used to determine whether a domain is probably the
360 * highest level for which cookies may be set, though even that depends on
361 * individual browsers' implementations of cookie controls. See
362 * <a href="http://www.ietf.org/rfc/rfc2109.txt">RFC 2109</a> for details.
363 *
364 * @since 6
365 */
366 public boolean isTopPrivateDomain() {
367 return publicSuffixIndex == 1;
368 }
369
370 /**
371 * Returns the portion of this domain name that is one level beneath the
372 * public suffix. For example, for {@code x.adwords.google.co.uk} it returns
373 * {@code google.co.uk}, since {@code co.uk} is a public suffix.
374 *
375 * <p>If {@link #isTopPrivateDomain()} is true, the current domain name
376 * instance is returned.
377 *
378 * <p>This method should not be used to determine the topmost parent domain
379 * which is addressable as a host, as many public suffixes are also
380 * addressable hosts. For example, the domain {@code foo.bar.uk.com} has
381 * a public suffix of {@code uk.com}, so it would return {@code bar.uk.com}
382 * from this method. But {@code uk.com} is itself an addressable host.
383 *
384 * <p>This method can be used to determine the probable highest level parent
385 * domain for which cookies may be set, though even that depends on individual
386 * browsers' implementations of cookie controls.
387 *
388 * @throws IllegalStateException if this domain does not end with a
389 * public suffix
390 * @since 6
391 */
392 public InternetDomainName topPrivateDomain() {
393 if (isTopPrivateDomain()) {
394 return this;
395 }
396 checkState(isUnderPublicSuffix(), "Not under a public suffix: %s", name);
397 return ancestor(publicSuffixIndex - 1);
398 }
399
400 /**
401 * Indicates whether this domain is composed of two or more parts.
402 */
403 public boolean hasParent() {
404 return parts.size() > 1;
405 }
406
407 /**
408 * Returns an {@code InternetDomainName} that is the immediate ancestor of
409 * this one; that is, the current domain with the leftmost part removed. For
410 * example, the parent of {@code www.google.com} is {@code google.com}.
411 *
412 * @throws IllegalStateException if the domain has no parent, as determined
413 * by {@link #hasParent}
414 */
415 public InternetDomainName parent() {
416 checkState(hasParent(), "Domain '%s' has no parent", name);
417 return ancestor(1);
418 }
419
420 /**
421 * Returns the ancestor of the current domain at the given number of levels
422 * "higher" (rightward) in the subdomain list. The number of levels must be
423 * non-negative, and less than {@code N-1}, where {@code N} is the number of
424 * parts in the domain.
425 *
426 * <p>TODO: Reasonable candidate for addition to public API.
427 */
428 private InternetDomainName ancestor(int levels) {
429 return new InternetDomainName(parts.subList(levels, parts.size()));
430 }
431
432 /**
433 * Creates and returns a new {@code InternetDomainName} by prepending the
434 * argument and a dot to the current name. For example, {@code
435 * InternetDomainName.from("foo.com").child("www.bar")} returns a new {@code
436 * InternetDomainName} with the value {@code www.bar.foo.com}.
437 *
438 * @throws NullPointerException if leftParts is null
439 * @throws IllegalArgumentException if the resulting name is not valid
440 */
441 public InternetDomainName child(String leftParts) {
442 return InternetDomainName.fromLenient(checkNotNull(leftParts) + "." + name);
443 }
444
445 /**
446 * Indicates whether the argument is a syntactically valid domain name. Only
447 * lenient validation is done, as described in {@link #fromLenient(String)}.
448 *
449 * <p>This method is intended for the case where a {@link String} must be
450 * validated as a valid domain name, but no further work with that
451 * {@link String} as an {@link InternetDomainName} will be required. Code like
452 * the following will unnecessarily repeat the work of validation:
453 * <pre> {@code
454 *
455 * if (InternetDomainName.isValid(name)) {
456 * domainName = InternetDomainName.from(name);
457 * } else {
458 * domainName = DEFAULT_DOMAIN;
459 * }}</pre>
460 *
461 * Such code could instead be written as follows: <pre> {@code
462 *
463 * try {
464 * domainName = InternetDomainName.from(name);
465 * } catch (IllegalArgumentException e) {
466 * domainName = DEFAULT_DOMAIN;
467 * }}</pre>
468 *
469 * @since 8 (previously named {@code isValid})
470 */
471 public static boolean isValidLenient(String name) {
472 try {
473 fromLenient(name);
474 return true;
475 } catch (IllegalArgumentException e) {
476 return false;
477 }
478 }
479
480 /**
481 * Does the domain name match one of the "wildcard" patterns (e.g. "*.ar")?
482 */
483 private static boolean matchesWildcardPublicSuffix(String domain) {
484 final String[] pieces = domain.split(DOT_REGEX, 2);
485 return pieces.length == 2 && TldPatterns.UNDER.contains(pieces[1]);
486 }
487
488 // TODO: specify this to return the same as name(); remove name()
489 @Override
490 public String toString() {
491 return Objects.toStringHelper(this).add("name", name).toString();
492 }
493
494 @Override
495 public boolean equals(@Nullable Object object) {
496 if (object == this) {
497 return true;
498 }
499
500 if (object instanceof InternetDomainName) {
501 InternetDomainName that = (InternetDomainName) object;
502 return this.name.equals(that.name);
503 }
504
505 return false;
506 }
507
508 @Override
509 public int hashCode() {
510 return name.hashCode();
511 }
512
513 }