001 /*
002 * Copyright (C) 2009 The Guava Authors
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 * http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 */
016
017 package com.google.common.net;
018
019 import static com.google.common.base.Preconditions.checkArgument;
020 import static com.google.common.base.Preconditions.checkNotNull;
021 import static com.google.common.base.Preconditions.checkState;
022
023 import com.google.common.annotations.Beta;
024 import com.google.common.annotations.GwtCompatible;
025 import com.google.common.base.Ascii;
026 import com.google.common.base.CharMatcher;
027 import com.google.common.base.Joiner;
028 import com.google.common.base.Objects;
029 import com.google.common.base.Splitter;
030 import com.google.common.collect.ImmutableList;
031
032 import java.util.List;
033
034 import javax.annotation.Nullable;
035
036 /**
037 * An immutable well-formed internet domain name, such as {@code com} or {@code
038 * foo.co.uk}. Only syntactic analysis is performed; no DNS lookups or other
039 * network interactions take place. Thus there is no guarantee that the domain
040 * actually exists on the internet.
041 *
042 * <p>One common use of this class is to determine whether a given string is
043 * likely to represent an addressable domain on the web -- that is, for a
044 * candidate string {@code "xxx"}, might browsing to {@code "http://xxx/"}
045 * result in a webpage being displayed? In the past, this test was frequently
046 * done by determining whether the domain ended with a {@linkplain
047 * #isPublicSuffix() public suffix} but was not itself a public suffix. However,
048 * this test is no longer accurate. There are many domains which are both public
049 * suffixes and addressable as hosts; {@code "uk.com"} is one example. As a
050 * result, the only useful test to determine if a domain is a plausible web host
051 * is {@link #hasPublicSuffix()}. This will return {@code true} for many domains
052 * which (currently) are not hosts, such as {@code "com"}), but given that any
053 * public suffix may become a host without warning, it is better to err on the
054 * side of permissiveness and thus avoid spurious rejection of valid sites.
055 *
056 * <p>During construction, names are normalized in two ways:
057 * <ol>
058 * <li>ASCII uppercase characters are converted to lowercase.
059 * <li>Unicode dot separators other than the ASCII period ({@code '.'}) are
060 * converted to the ASCII period.
061 * </ol>
062 * The normalized values will be returned from {@link #name()} and
063 * {@link #parts()}, and will be reflected in the result of
064 * {@link #equals(Object)}.
065 *
066 * <p><a href="http://en.wikipedia.org/wiki/Internationalized_domain_name">
067 * internationalized domain names</a> such as {@code 网络.cn} are supported, as
068 * are the equivalent <a
069 * href="http://en.wikipedia.org/wiki/Internationalized_domain_name">IDNA
070 * Punycode-encoded</a> versions.
071 *
072 * @author Craig Berry
073 * @since 5
074 */
075 @Beta
076 @GwtCompatible(emulated = true)
077 public class InternetDomainName {
078
079 private static final CharMatcher DOTS_MATCHER =
080 CharMatcher.anyOf(".\u3002\uFF0E\uFF61");
081 private static final Splitter DOT_SPLITTER = Splitter.on('.');
082 private static final Joiner DOT_JOINER = Joiner.on('.');
083
084 /**
085 * Value of {@link #publicSuffixIndex} which indicates that no public suffix
086 * was found.
087 */
088 private static final int NO_PUBLIC_SUFFIX_FOUND = -1;
089
090 private static final String DOT_REGEX = "\\.";
091
092 /**
093 * Maximum parts (labels) in a domain name. This value arises from
094 * the 255-octet limit described in
095 * <a href="http://www.ietf.org/rfc/rfc2181.txt">RFC 2181</a> part 11 with
096 * the fact that the encoding of each part occupies at least two bytes
097 * (dot plus label externally, length byte plus label internally). Thus, if
098 * all labels have the minimum size of one byte, 127 of them will fit.
099 */
100 private static final int MAX_PARTS = 127;
101
102 /**
103 * Maximum length of a full domain name, including separators, and
104 * leaving room for the root label. See
105 * <a href="http://www.ietf.org/rfc/rfc2181.txt">RFC 2181</a> part 11.
106 */
107 private static final int MAX_LENGTH = 253;
108
109 /**
110 * Maximum size of a single part of a domain name. See
111 * <a href="http://www.ietf.org/rfc/rfc2181.txt">RFC 2181</a> part 11.
112 */
113 private static final int MAX_DOMAIN_PART_LENGTH = 63;
114
115 /**
116 * The full domain name, converted to lower case.
117 */
118 private final String name;
119
120 /**
121 * The parts of the domain name, converted to lower case.
122 */
123 private final ImmutableList<String> parts;
124
125 /**
126 * The index in the {@link #parts()} list at which the public suffix begins.
127 * For example, for the domain name {@code www.google.co.uk}, the value would
128 * be 2 (the index of the {@code co} part). The value is negative
129 * (specifically, {@link #NO_PUBLIC_SUFFIX_FOUND}) if no public suffix was
130 * found.
131 */
132 private final int publicSuffixIndex;
133
134 /**
135 * Private constructor used to implement {@link #fromLenient(String)}.
136 */
137 private InternetDomainName(String name) {
138 // Normalize:
139 // * ASCII characters to lowercase
140 // * All dot-like characters to '.'
141 // * Strip trailing '.'
142
143 name = Ascii.toLowerCase(DOTS_MATCHER.replaceFrom(name, '.'));
144
145 if (name.endsWith(".")) {
146 name = name.substring(0, name.length() - 1);
147 }
148
149 checkArgument(name.length() <= MAX_LENGTH, "Domain name too long: '%s':", name);
150 this.name = name;
151
152 this.parts = ImmutableList.copyOf(DOT_SPLITTER.split(name));
153 checkArgument(parts.size() <= MAX_PARTS, "Domain has too many parts: '%s'", name);
154 checkArgument(validateSyntax(parts), "Not a valid domain name: '%s'", name);
155
156 this.publicSuffixIndex = findPublicSuffix();
157 }
158
159 /**
160 * Returns the index of the leftmost part of the public suffix, or -1 if not
161 * found. Note that the value defined as the "public suffix" may not be a
162 * public suffix according to {@link #isPublicSuffix()} if the domain ends
163 * with an excluded domain pattern such as {@code "nhs.uk"}.
164 */
165 private int findPublicSuffix() {
166 final int partsSize = parts.size();
167
168 for (int i = 0; i < partsSize; i++) {
169 String ancestorName = DOT_JOINER.join(parts.subList(i, partsSize));
170
171 if (TldPatterns.EXACT.contains(ancestorName)) {
172 return i;
173 }
174
175 // Excluded domains (e.g. !nhs.uk) use the next highest
176 // domain as the effective public suffix (e.g. uk).
177
178 if (TldPatterns.EXCLUDED.contains(ancestorName)) {
179 return i + 1;
180 }
181
182 if (matchesWildcardPublicSuffix(ancestorName)) {
183 return i;
184 }
185 }
186
187 return NO_PUBLIC_SUFFIX_FOUND;
188 }
189
190 /**
191 * Returns an instance of {@link InternetDomainName} after lenient
192 * validation. Specifically, validation against <a
193 * href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>
194 * ("Internationalizing Domain Names in Applications") is skipped, while
195 * validation against <a
196 * href="http://www.ietf.org/rfc/rfc1035.txt">RFC 1035</a> is relaxed in
197 * the following ways:
198 * <ul>
199 * <li>Any part containing non-ASCII characters is considered valid.
200 * <li>Underscores ('_') are permitted wherever dashes ('-') are permitted.
201 * <li>Parts other than the final part may start with a digit.
202 * </ul>
203 *
204 * @param domain A domain name (not IP address)
205 * @throws IllegalArgumentException if {@code name} is not syntactically valid
206 * according to {@link #isValidLenient}
207 * @since 8 (previously named {@code from})
208 */
209 public static InternetDomainName fromLenient(String domain) {
210 return new InternetDomainName(checkNotNull(domain));
211 }
212
213 /**
214 * Validation method used by {@from} to ensure that the domain name is
215 * syntactically valid according to RFC 1035.
216 *
217 * @return Is the domain name syntactically valid?
218 */
219 private static boolean validateSyntax(List<String> parts) {
220 final int lastIndex = parts.size() - 1;
221
222 // Validate the last part specially, as it has different syntax rules.
223
224 if (!validatePart(parts.get(lastIndex), true)) {
225 return false;
226 }
227
228 for (int i = 0; i < lastIndex; i++) {
229 String part = parts.get(i);
230 if (!validatePart(part, false)) {
231 return false;
232 }
233 }
234
235 return true;
236 }
237
238 private static final CharMatcher DASH_MATCHER = CharMatcher.anyOf("-_");
239
240 private static final CharMatcher PART_CHAR_MATCHER =
241 CharMatcher.JAVA_LETTER_OR_DIGIT.or(DASH_MATCHER);
242
243 /**
244 * Helper method for {@link #validateSyntax(List)}. Validates that one part of
245 * a domain name is valid.
246 *
247 * @param part The domain name part to be validated
248 * @param isFinalPart Is this the final (rightmost) domain part?
249 * @return Whether the part is valid
250 */
251 private static boolean validatePart(String part, boolean isFinalPart) {
252
253 // These tests could be collapsed into one big boolean expression, but
254 // they have been left as independent tests for clarity.
255
256 if (part.length() < 1 || part.length() > MAX_DOMAIN_PART_LENGTH) {
257 return false;
258 }
259
260 /*
261 * GWT claims to support java.lang.Character's char-classification methods,
262 * but it actually only works for ASCII. So for now, assume any non-ASCII
263 * characters are valid. The only place this seems to be documented is here:
264 * http://osdir.com/ml/GoogleWebToolkitContributors/2010-03/msg00178.html
265 *
266 * <p>ASCII characters in the part are expected to be valid per RFC 1035,
267 * with underscore also being allowed due to widespread practice.
268 */
269
270 String asciiChars = CharMatcher.ASCII.retainFrom(part);
271
272 if (!PART_CHAR_MATCHER.matchesAllOf(asciiChars)) {
273 return false;
274 }
275
276 // No initial or final dashes or underscores.
277
278 if (DASH_MATCHER.matches(part.charAt(0))
279 || DASH_MATCHER.matches(part.charAt(part.length() - 1))) {
280 return false;
281 }
282
283 /*
284 * Note that we allow (in contravention of a strict interpretation of the
285 * relevant RFCs) domain parts other than the last may begin with a digit
286 * (for example, "3com.com"). It's important to disallow an initial digit in
287 * the last part; it's the only thing that stops an IPv4 numeric address
288 * like 127.0.0.1 from looking like a valid domain name.
289 */
290
291 if (isFinalPart && CharMatcher.DIGIT.matches(part.charAt(0))) {
292 return false;
293 }
294
295 return true;
296 }
297
298 /**
299 * Returns the domain name, normalized to all lower case.
300 */
301 public String name() {
302 return name;
303 }
304
305 /**
306 * Returns the individual components of this domain name, normalized to all
307 * lower case. For example, for the domain name {@code mail.google.com}, this
308 * method returns the list {@code ["mail", "google", "com"]}.
309 */
310 public ImmutableList<String> parts() {
311 return parts;
312 }
313
314 /**
315 * Indicates whether this domain name represents a <i>public suffix</i>, as
316 * defined by the Mozilla Foundation's
317 * <a href="http://publicsuffix.org/">Public Suffix List</a> (PSL). A public
318 * suffix is one under which Internet users can directly register names, such
319 * as {@code com}, {@code co.uk} or {@code pvt.k12.wy.us}. Examples of domain
320 * names that are <i>not</i> public suffixes include {@code google}, {@code
321 * google.com} and {@code foo.co.uk}.
322 *
323 * @return {@code true} if this domain name appears exactly on the public
324 * suffix list
325 * @since 6
326 */
327 public boolean isPublicSuffix() {
328 return publicSuffixIndex == 0;
329 }
330
331 /**
332 * Indicates whether this domain name ends in a {@linkplain #isPublicSuffix()
333 * public suffix}, including if it is a public suffix itself. For example,
334 * returns {@code true} for {@code www.google.com}, {@code foo.co.uk} and
335 * {@code com}, but not for {@code google} or {@code google.foo}. This is
336 * the recommended method for determining whether a domain is potentially an
337 * addressable host.
338 *
339 * @since 6
340 */
341 public boolean hasPublicSuffix() {
342 return publicSuffixIndex != NO_PUBLIC_SUFFIX_FOUND;
343 }
344
345 /**
346 * Returns the {@linkplain #isPublicSuffix() public suffix} portion of the
347 * domain name, or {@code null} if no public suffix is present.
348 *
349 * @since 6
350 */
351 public InternetDomainName publicSuffix() {
352 return hasPublicSuffix() ? ancestor(publicSuffixIndex) : null;
353 }
354
355 /**
356 * Indicates whether this domain name ends in a {@linkplain #isPublicSuffix()
357 * public suffix}, while not being a public suffix itself. For example,
358 * returns {@code true} for {@code www.google.com}, {@code foo.co.uk} and
359 * {@code bar.ca.us}, but not for {@code google}, {@code com}, or {@code
360 * google.foo}.
361 *
362 * <p><b>Warning:</b> a {@code false} result from this method does not imply
363 * that the domain does not represent an addressable host, as many public
364 * suffixes are also addressable hosts. Use {@link #hasPublicSuffix()} for
365 * that test.
366 *
367 * <p>This method can be used to determine whether it will probably be
368 * possible to set cookies on the domain, though even that depends on
369 * individual browsers' implementations of cookie controls. See
370 * <a href="http://www.ietf.org/rfc/rfc2109.txt">RFC 2109</a> for details.
371 *
372 * @since 6
373 */
374 public boolean isUnderPublicSuffix() {
375 return publicSuffixIndex > 0;
376 }
377
378 /**
379 * Indicates whether this domain name is composed of exactly one subdomain
380 * component followed by a {@linkplain #isPublicSuffix() public suffix}. For
381 * example, returns {@code true} for {@code google.com} and {@code foo.co.uk},
382 * but not for {@code www.google.com} or {@code co.uk}.
383 *
384 * <p><b>Warning:</b> A {@code true} result from this method does not imply
385 * that the domain is at the highest level which is addressable as a host, as
386 * many public suffixes are also addressable hosts. For example, the domain
387 * {@code bar.uk.com} has a public suffix of {@code uk.com}, so it would
388 * return {@code true} from this method. But {@code uk.com} is itself an
389 * addressable host.
390 *
391 * <p>This method can be used to determine whether a domain is probably the
392 * highest level for which cookies may be set, though even that depends on
393 * individual browsers' implementations of cookie controls. See
394 * <a href="http://www.ietf.org/rfc/rfc2109.txt">RFC 2109</a> for details.
395 *
396 * @since 6
397 */
398 public boolean isTopPrivateDomain() {
399 return publicSuffixIndex == 1;
400 }
401
402 /**
403 * Returns the portion of this domain name that is one level beneath the
404 * public suffix. For example, for {@code x.adwords.google.co.uk} it returns
405 * {@code google.co.uk}, since {@code co.uk} is a public suffix.
406 *
407 * <p>If {@link #isTopPrivateDomain()} is true, the current domain name
408 * instance is returned.
409 *
410 * <p>This method should not be used to determine the topmost parent domain
411 * which is addressable as a host, as many public suffixes are also
412 * addressable hosts. For example, the domain {@code foo.bar.uk.com} has
413 * a public suffix of {@code uk.com}, so it would return {@code bar.uk.com}
414 * from this method. But {@code uk.com} is itself an addressable host.
415 *
416 * <p>This method can be used to determine the probable highest level parent
417 * domain for which cookies may be set, though even that depends on individual
418 * browsers' implementations of cookie controls.
419 *
420 * @throws IllegalStateException if this domain does not end with a
421 * public suffix
422 * @since 6
423 */
424 public InternetDomainName topPrivateDomain() {
425 if (isTopPrivateDomain()) {
426 return this;
427 }
428 checkState(isUnderPublicSuffix(), "Not under a public suffix: %s", name);
429 return ancestor(publicSuffixIndex - 1);
430 }
431
432 /**
433 * Indicates whether this domain is composed of two or more parts.
434 */
435 public boolean hasParent() {
436 return parts.size() > 1;
437 }
438
439 /**
440 * Returns an {@code InternetDomainName} that is the immediate ancestor of
441 * this one; that is, the current domain with the leftmost part removed. For
442 * example, the parent of {@code www.google.com} is {@code google.com}.
443 *
444 * @throws IllegalStateException if the domain has no parent, as determined
445 * by {@link #hasParent}
446 */
447 public InternetDomainName parent() {
448 checkState(hasParent(), "Domain '%s' has no parent", name);
449 return ancestor(1);
450 }
451
452 /**
453 * Returns the ancestor of the current domain at the given number of levels
454 * "higher" (rightward) in the subdomain list. The number of levels must be
455 * non-negative, and less than {@code N-1}, where {@code N} is the number of
456 * parts in the domain.
457 *
458 * <p>TODO: Reasonable candidate for addition to public API.
459 */
460 private InternetDomainName ancestor(int levels) {
461 return fromInternal(DOT_JOINER.join(parts.subList(levels, parts.size())));
462 }
463
464 /**
465 * Creates and returns a new {@code InternetDomainName} by prepending the
466 * argument and a dot to the current name. For example, {@code
467 * InternetDomainName.fromLenient("foo.com").child("www.bar")} returns a new
468 * {@code InternetDomainName} with the value {@code www.bar.foo.com}.
469 *
470 * @throws NullPointerException if leftParts is null
471 * @throws IllegalArgumentException if the resulting name is not valid
472 */
473 public InternetDomainName child(String leftParts) {
474 return fromInternal(checkNotNull(leftParts) + "." + name);
475 }
476
477 /**
478 * Returns a new {@link InternetDomainName} instance with the given {@code
479 * name}, using the same validation as the instance on which it is called.
480 */
481 InternetDomainName fromInternal(String name) {
482 return fromLenient(name);
483 }
484
485 /**
486 * Indicates whether the argument is a syntactically valid domain name after
487 * lenient validation. Specifically, validation against <a
488 * href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>
489 * ("Internationalizing Domain Names in Applications") is skipped.
490 *
491 * <p>The follow two code snippets are equivalent:
492 *
493 * <pre> {@code
494 *
495 * if (InternetDomainName.isValidLenient(name)) {
496 * domainName = InternetDomainName.fromLenient(name);
497 * } else {
498 * domainName = DEFAULT_DOMAIN;
499 * }}</pre>
500 *
501 * <pre> {@code
502 *
503 * try {
504 * domainName = InternetDomainName.fromLenient(name);
505 * } catch (IllegalArgumentException e) {
506 * domainName = DEFAULT_DOMAIN;
507 * }}</pre>
508 *
509 * @since 8 (previously named {@code isValid})
510 */
511 public static boolean isValidLenient(String name) {
512 try {
513 fromLenient(name);
514 return true;
515 } catch (IllegalArgumentException e) {
516 return false;
517 }
518 }
519
520 /**
521 * Does the domain name match one of the "wildcard" patterns (e.g.
522 * {@code "*.ar"})?
523 */
524 private static boolean matchesWildcardPublicSuffix(String domain) {
525 final String[] pieces = domain.split(DOT_REGEX, 2);
526 return pieces.length == 2 && TldPatterns.UNDER.contains(pieces[1]);
527 }
528
529 // TODO: specify this to return the same as name(); remove name()
530 @Override
531 public String toString() {
532 return Objects.toStringHelper(this).add("name", name).toString();
533 }
534
535 @Override
536 public boolean equals(@Nullable Object object) {
537 if (object == this) {
538 return true;
539 }
540
541 if (object instanceof InternetDomainName) {
542 InternetDomainName that = (InternetDomainName) object;
543 return this.name.equals(that.name);
544 }
545
546 return false;
547 }
548
549 @Override
550 public int hashCode() {
551 return name.hashCode();
552 }
553 }