001/* 002 * Copyright (C) 2009 The Guava Authors 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 */ 016 017package com.google.common.net; 018 019import static com.google.common.base.Preconditions.checkArgument; 020import static com.google.common.base.Preconditions.checkNotNull; 021import static com.google.common.base.Preconditions.checkState; 022 023import com.google.common.annotations.Beta; 024import com.google.common.annotations.GwtCompatible; 025import com.google.common.base.Ascii; 026import com.google.common.base.CharMatcher; 027import com.google.common.base.Joiner; 028import com.google.common.base.Splitter; 029import com.google.common.collect.ImmutableList; 030 031import java.util.List; 032 033import javax.annotation.Nullable; 034 035/** 036 * An immutable well-formed internet domain name, such as {@code com} or {@code 037 * foo.co.uk}. Only syntactic analysis is performed; no DNS lookups or other 038 * network interactions take place. Thus there is no guarantee that the domain 039 * actually exists on the internet. 040 * 041 * <p>One common use of this class is to determine whether a given string is 042 * likely to represent an addressable domain on the web -- that is, for a 043 * candidate string {@code "xxx"}, might browsing to {@code "http://xxx/"} 044 * result in a webpage being displayed? In the past, this test was frequently 045 * done by determining whether the domain ended with a {@linkplain 046 * #isPublicSuffix() public suffix} but was not itself a public suffix. However, 047 * this test is no longer accurate. There are many domains which are both public 048 * suffixes and addressable as hosts; {@code "uk.com"} is one example. As a 049 * result, the only useful test to determine if a domain is a plausible web host 050 * is {@link #hasPublicSuffix()}. This will return {@code true} for many domains 051 * which (currently) are not hosts, such as {@code "com"}, but given that any 052 * public suffix may become a host without warning, it is better to err on the 053 * side of permissiveness and thus avoid spurious rejection of valid sites. 054 * 055 * <p>During construction, names are normalized in two ways: 056 * <ol> 057 * <li>ASCII uppercase characters are converted to lowercase. 058 * <li>Unicode dot separators other than the ASCII period ({@code '.'}) are 059 * converted to the ASCII period. 060 * </ol> 061 * <p>The normalized values will be returned from {@link #name()} and 062 * {@link #parts()}, and will be reflected in the result of 063 * {@link #equals(Object)}. 064 * 065 * <p><a href="http://en.wikipedia.org/wiki/Internationalized_domain_name"> 066 * Internationalized domain names</a> such as {@code 网络.cn} are supported, as 067 * are the equivalent <a 068 * href="http://en.wikipedia.org/wiki/Internationalized_domain_name">IDNA 069 * Punycode-encoded</a> versions. 070 * 071 * @author Craig Berry 072 * @since 5.0 073 */ 074@Beta 075@GwtCompatible 076public final class InternetDomainName { 077 078 private static final CharMatcher DOTS_MATCHER = 079 CharMatcher.anyOf(".\u3002\uFF0E\uFF61"); 080 private static final Splitter DOT_SPLITTER = Splitter.on('.'); 081 private static final Joiner DOT_JOINER = Joiner.on('.'); 082 083 /** 084 * Value of {@link #publicSuffixIndex} which indicates that no public suffix 085 * was found. 086 */ 087 private static final int NO_PUBLIC_SUFFIX_FOUND = -1; 088 089 private static final String DOT_REGEX = "\\."; 090 091 /** 092 * Maximum parts (labels) in a domain name. This value arises from 093 * the 255-octet limit described in 094 * <a href="http://www.ietf.org/rfc/rfc2181.txt">RFC 2181</a> part 11 with 095 * the fact that the encoding of each part occupies at least two bytes 096 * (dot plus label externally, length byte plus label internally). Thus, if 097 * all labels have the minimum size of one byte, 127 of them will fit. 098 */ 099 private static final int MAX_PARTS = 127; 100 101 /** 102 * Maximum length of a full domain name, including separators, and 103 * leaving room for the root label. See 104 * <a href="http://www.ietf.org/rfc/rfc2181.txt">RFC 2181</a> part 11. 105 */ 106 private static final int MAX_LENGTH = 253; 107 108 /** 109 * Maximum size of a single part of a domain name. See 110 * <a href="http://www.ietf.org/rfc/rfc2181.txt">RFC 2181</a> part 11. 111 */ 112 private static final int MAX_DOMAIN_PART_LENGTH = 63; 113 114 /** 115 * The full domain name, converted to lower case. 116 */ 117 private final String name; 118 119 /** 120 * The parts of the domain name, converted to lower case. 121 */ 122 private final ImmutableList<String> parts; 123 124 /** 125 * The index in the {@link #parts()} list at which the public suffix begins. 126 * For example, for the domain name {@code www.google.co.uk}, the value would 127 * be 2 (the index of the {@code co} part). The value is negative 128 * (specifically, {@link #NO_PUBLIC_SUFFIX_FOUND}) if no public suffix was 129 * found. 130 */ 131 private final int publicSuffixIndex; 132 133 /** 134 * Constructor used to implement {@link #from(String)}, and from subclasses. 135 */ 136 InternetDomainName(String name) { 137 // Normalize: 138 // * ASCII characters to lowercase 139 // * All dot-like characters to '.' 140 // * Strip trailing '.' 141 142 name = Ascii.toLowerCase(DOTS_MATCHER.replaceFrom(name, '.')); 143 144 if (name.endsWith(".")) { 145 name = name.substring(0, name.length() - 1); 146 } 147 148 checkArgument(name.length() <= MAX_LENGTH, 149 "Domain name too long: '%s':", name); 150 this.name = name; 151 152 this.parts = ImmutableList.copyOf(DOT_SPLITTER.split(name)); 153 checkArgument(parts.size() <= MAX_PARTS, 154 "Domain has too many parts: '%s'", name); 155 checkArgument(validateSyntax(parts), "Not a valid domain name: '%s'", name); 156 157 this.publicSuffixIndex = findPublicSuffix(); 158 } 159 160 /** 161 * Returns the index of the leftmost part of the public suffix, or -1 if not 162 * found. Note that the value defined as the "public suffix" may not be a 163 * public suffix according to {@link #isPublicSuffix()} if the domain ends 164 * with an excluded domain pattern such as {@code "nhs.uk"}. 165 */ 166 private int findPublicSuffix() { 167 final int partsSize = parts.size(); 168 169 for (int i = 0; i < partsSize; i++) { 170 String ancestorName = DOT_JOINER.join(parts.subList(i, partsSize)); 171 172 if (TldPatterns.EXACT.contains(ancestorName)) { 173 return i; 174 } 175 176 // Excluded domains (e.g. !nhs.uk) use the next highest 177 // domain as the effective public suffix (e.g. uk). 178 179 if (TldPatterns.EXCLUDED.contains(ancestorName)) { 180 return i + 1; 181 } 182 183 if (matchesWildcardPublicSuffix(ancestorName)) { 184 return i; 185 } 186 } 187 188 return NO_PUBLIC_SUFFIX_FOUND; 189 } 190 191 /** 192 * A deprecated synonym for {@link #from(String)}. 193 * 194 * @param domain A domain name (not IP address) 195 * @throws IllegalArgumentException if {@code name} is not syntactically valid 196 * according to {@link #isValid} 197 * @since 8.0 (previously named {@code from}) 198 * @deprecated Use {@link #from(String)} 199 */ 200 @Deprecated 201 public static InternetDomainName fromLenient(String domain) { 202 return from(domain); 203 } 204 205 /** 206 * Returns an instance of {@link InternetDomainName} after lenient 207 * validation. Specifically, validation against <a 208 * href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a> 209 * ("Internationalizing Domain Names in Applications") is skipped, while 210 * validation against <a 211 * href="http://www.ietf.org/rfc/rfc1035.txt">RFC 1035</a> is relaxed in 212 * the following ways: 213 * <ul> 214 * <li>Any part containing non-ASCII characters is considered valid. 215 * <li>Underscores ('_') are permitted wherever dashes ('-') are permitted. 216 * <li>Parts other than the final part may start with a digit. 217 * </ul> 218 * 219 * 220 * @param domain A domain name (not IP address) 221 * @throws IllegalArgumentException if {@code name} is not syntactically valid 222 * according to {@link #isValid} 223 * @since 10.0 (previously named {@code fromLenient}) 224 */ 225 public static InternetDomainName from(String domain) { 226 return new InternetDomainName(checkNotNull(domain)); 227 } 228 229 /** 230 * Validation method used by {@from} to ensure that the domain name is 231 * syntactically valid according to RFC 1035. 232 * 233 * @return Is the domain name syntactically valid? 234 */ 235 private static boolean validateSyntax(List<String> parts) { 236 final int lastIndex = parts.size() - 1; 237 238 // Validate the last part specially, as it has different syntax rules. 239 240 if (!validatePart(parts.get(lastIndex), true)) { 241 return false; 242 } 243 244 for (int i = 0; i < lastIndex; i++) { 245 String part = parts.get(i); 246 if (!validatePart(part, false)) { 247 return false; 248 } 249 } 250 251 return true; 252 } 253 254 private static final CharMatcher DASH_MATCHER = CharMatcher.anyOf("-_"); 255 256 private static final CharMatcher PART_CHAR_MATCHER = 257 CharMatcher.JAVA_LETTER_OR_DIGIT.or(DASH_MATCHER); 258 259 /** 260 * Helper method for {@link #validateSyntax(List)}. Validates that one part of 261 * a domain name is valid. 262 * 263 * @param part The domain name part to be validated 264 * @param isFinalPart Is this the final (rightmost) domain part? 265 * @return Whether the part is valid 266 */ 267 private static boolean validatePart(String part, boolean isFinalPart) { 268 269 // These tests could be collapsed into one big boolean expression, but 270 // they have been left as independent tests for clarity. 271 272 if (part.length() < 1 || part.length() > MAX_DOMAIN_PART_LENGTH) { 273 return false; 274 } 275 276 /* 277 * GWT claims to support java.lang.Character's char-classification methods, 278 * but it actually only works for ASCII. So for now, assume any non-ASCII 279 * characters are valid. The only place this seems to be documented is here: 280 * http://osdir.com/ml/GoogleWebToolkitContributors/2010-03/msg00178.html 281 * 282 * <p>ASCII characters in the part are expected to be valid per RFC 1035, 283 * with underscore also being allowed due to widespread practice. 284 */ 285 286 String asciiChars = CharMatcher.ASCII.retainFrom(part); 287 288 if (!PART_CHAR_MATCHER.matchesAllOf(asciiChars)) { 289 return false; 290 } 291 292 // No initial or final dashes or underscores. 293 294 if (DASH_MATCHER.matches(part.charAt(0)) 295 || DASH_MATCHER.matches(part.charAt(part.length() - 1))) { 296 return false; 297 } 298 299 /* 300 * Note that we allow (in contravention of a strict interpretation of the 301 * relevant RFCs) domain parts other than the last may begin with a digit 302 * (for example, "3com.com"). It's important to disallow an initial digit in 303 * the last part; it's the only thing that stops an IPv4 numeric address 304 * like 127.0.0.1 from looking like a valid domain name. 305 */ 306 307 if (isFinalPart && CharMatcher.DIGIT.matches(part.charAt(0))) { 308 return false; 309 } 310 311 return true; 312 } 313 314 /** 315 * A deprecated synonym for {@link #toString()}. 316 * 317 * @deprecated Use {@link #toString()} 318 */ 319 @Deprecated 320 public String name() { 321 return toString(); 322 } 323 324 /** 325 * Returns the individual components of this domain name, normalized to all 326 * lower case. For example, for the domain name {@code mail.google.com}, this 327 * method returns the list {@code ["mail", "google", "com"]}. 328 */ 329 public ImmutableList<String> parts() { 330 return parts; 331 } 332 333 /** 334 * Indicates whether this domain name represents a <i>public suffix</i>, as 335 * defined by the Mozilla Foundation's 336 * <a href="http://publicsuffix.org/">Public Suffix List</a> (PSL). A public 337 * suffix is one under which Internet users can directly register names, such 338 * as {@code com}, {@code co.uk} or {@code pvt.k12.wy.us}. Examples of domain 339 * names that are <i>not</i> public suffixes include {@code google}, {@code 340 * google.com} and {@code foo.co.uk}. 341 * 342 * @return {@code true} if this domain name appears exactly on the public 343 * suffix list 344 * @since 6.0 345 */ 346 public boolean isPublicSuffix() { 347 return publicSuffixIndex == 0; 348 } 349 350 /** 351 * Indicates whether this domain name ends in a {@linkplain #isPublicSuffix() 352 * public suffix}, including if it is a public suffix itself. For example, 353 * returns {@code true} for {@code www.google.com}, {@code foo.co.uk} and 354 * {@code com}, but not for {@code google} or {@code google.foo}. This is 355 * the recommended method for determining whether a domain is potentially an 356 * addressable host. 357 * 358 * @since 6.0 359 */ 360 public boolean hasPublicSuffix() { 361 return publicSuffixIndex != NO_PUBLIC_SUFFIX_FOUND; 362 } 363 364 /** 365 * Returns the {@linkplain #isPublicSuffix() public suffix} portion of the 366 * domain name, or {@code null} if no public suffix is present. 367 * 368 * @since 6.0 369 */ 370 public InternetDomainName publicSuffix() { 371 return hasPublicSuffix() ? ancestor(publicSuffixIndex) : null; 372 } 373 374 /** 375 * Indicates whether this domain name ends in a {@linkplain #isPublicSuffix() 376 * public suffix}, while not being a public suffix itself. For example, 377 * returns {@code true} for {@code www.google.com}, {@code foo.co.uk} and 378 * {@code bar.ca.us}, but not for {@code google}, {@code com}, or {@code 379 * google.foo}. 380 * 381 * <p><b>Warning:</b> a {@code false} result from this method does not imply 382 * that the domain does not represent an addressable host, as many public 383 * suffixes are also addressable hosts. Use {@link #hasPublicSuffix()} for 384 * that test. 385 * 386 * <p>This method can be used to determine whether it will probably be 387 * possible to set cookies on the domain, though even that depends on 388 * individual browsers' implementations of cookie controls. See 389 * <a href="http://www.ietf.org/rfc/rfc2109.txt">RFC 2109</a> for details. 390 * 391 * @since 6.0 392 */ 393 public boolean isUnderPublicSuffix() { 394 return publicSuffixIndex > 0; 395 } 396 397 /** 398 * Indicates whether this domain name is composed of exactly one subdomain 399 * component followed by a {@linkplain #isPublicSuffix() public suffix}. For 400 * example, returns {@code true} for {@code google.com} and {@code foo.co.uk}, 401 * but not for {@code www.google.com} or {@code co.uk}. 402 * 403 * <p><b>Warning:</b> A {@code true} result from this method does not imply 404 * that the domain is at the highest level which is addressable as a host, as 405 * many public suffixes are also addressable hosts. For example, the domain 406 * {@code bar.uk.com} has a public suffix of {@code uk.com}, so it would 407 * return {@code true} from this method. But {@code uk.com} is itself an 408 * addressable host. 409 * 410 * <p>This method can be used to determine whether a domain is probably the 411 * highest level for which cookies may be set, though even that depends on 412 * individual browsers' implementations of cookie controls. See 413 * <a href="http://www.ietf.org/rfc/rfc2109.txt">RFC 2109</a> for details. 414 * 415 * @since 6.0 416 */ 417 public boolean isTopPrivateDomain() { 418 return publicSuffixIndex == 1; 419 } 420 421 /** 422 * Returns the portion of this domain name that is one level beneath the 423 * public suffix. For example, for {@code x.adwords.google.co.uk} it returns 424 * {@code google.co.uk}, since {@code co.uk} is a public suffix. 425 * 426 * <p>If {@link #isTopPrivateDomain()} is true, the current domain name 427 * instance is returned. 428 * 429 * <p>This method should not be used to determine the topmost parent domain 430 * which is addressable as a host, as many public suffixes are also 431 * addressable hosts. For example, the domain {@code foo.bar.uk.com} has 432 * a public suffix of {@code uk.com}, so it would return {@code bar.uk.com} 433 * from this method. But {@code uk.com} is itself an addressable host. 434 * 435 * <p>This method can be used to determine the probable highest level parent 436 * domain for which cookies may be set, though even that depends on individual 437 * browsers' implementations of cookie controls. 438 * 439 * @throws IllegalStateException if this domain does not end with a 440 * public suffix 441 * @since 6.0 442 */ 443 public InternetDomainName topPrivateDomain() { 444 if (isTopPrivateDomain()) { 445 return this; 446 } 447 checkState(isUnderPublicSuffix(), "Not under a public suffix: %s", name); 448 return ancestor(publicSuffixIndex - 1); 449 } 450 451 /** 452 * Indicates whether this domain is composed of two or more parts. 453 */ 454 public boolean hasParent() { 455 return parts.size() > 1; 456 } 457 458 /** 459 * Returns an {@code InternetDomainName} that is the immediate ancestor of 460 * this one; that is, the current domain with the leftmost part removed. For 461 * example, the parent of {@code www.google.com} is {@code google.com}. 462 * 463 * @throws IllegalStateException if the domain has no parent, as determined 464 * by {@link #hasParent} 465 */ 466 public InternetDomainName parent() { 467 checkState(hasParent(), "Domain '%s' has no parent", name); 468 return ancestor(1); 469 } 470 471 /** 472 * Returns the ancestor of the current domain at the given number of levels 473 * "higher" (rightward) in the subdomain list. The number of levels must be 474 * non-negative, and less than {@code N-1}, where {@code N} is the number of 475 * parts in the domain. 476 * 477 * <p>TODO: Reasonable candidate for addition to public API. 478 */ 479 private InternetDomainName ancestor(int levels) { 480 return from(DOT_JOINER.join(parts.subList(levels, parts.size()))); 481 } 482 483 /** 484 * Creates and returns a new {@code InternetDomainName} by prepending the 485 * argument and a dot to the current name. For example, {@code 486 * InternetDomainName.from("foo.com").child("www.bar")} returns a new 487 * {@code InternetDomainName} with the value {@code www.bar.foo.com}. Only 488 * lenient validation is performed, as described {@link #from(String) here}. 489 * 490 * @throws NullPointerException if leftParts is null 491 * @throws IllegalArgumentException if the resulting name is not valid 492 */ 493 public InternetDomainName child(String leftParts) { 494 return from(checkNotNull(leftParts) + "." + name); 495 } 496 497 /** 498 * Indicates whether the argument is a syntactically valid domain name using 499 * lenient validation. Specifically, validation against <a 500 * href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a> 501 * ("Internationalizing Domain Names in Applications") is skipped. 502 * 503 * <p>The following two code snippets are equivalent: 504 * 505 * <pre> {@code 506 * domainName = InternetDomainName.isValid(name) 507 * ? InternetDomainName.from(name) 508 * : DEFAULT_DOMAIN;}</pre> 509 * 510 * <pre> {@code 511 * try { 512 * domainName = InternetDomainName.from(name); 513 * } catch (IllegalArgumentException e) { 514 * domainName = DEFAULT_DOMAIN; 515 * }}</pre> 516 * 517 * @since 8.0 (previously named {@code isValidLenient}) 518 */ 519 public static boolean isValid(String name) { 520 try { 521 from(name); 522 return true; 523 } catch (IllegalArgumentException e) { 524 return false; 525 } 526 } 527 528 /** 529 * Does the domain name match one of the "wildcard" patterns (e.g. 530 * {@code "*.ar"})? 531 */ 532 private static boolean matchesWildcardPublicSuffix(String domain) { 533 final String[] pieces = domain.split(DOT_REGEX, 2); 534 return pieces.length == 2 && TldPatterns.UNDER.contains(pieces[1]); 535 } 536 537 /** 538 * Returns the domain name, normalized to all lower case. 539 */ 540 @Override 541 public String toString() { 542 return name; 543 } 544 545 /** 546 * Equality testing is based on the text supplied by the caller, 547 * after normalization as described in the class documentation. For 548 * example, a non-ASCII Unicode domain name and the Punycode version 549 * of the same domain name would not be considered equal. 550 * 551 */ 552 @Override 553 public boolean equals(@Nullable Object object) { 554 if (object == this) { 555 return true; 556 } 557 558 if (object instanceof InternetDomainName) { 559 InternetDomainName that = (InternetDomainName) object; 560 return this.name.equals(that.name); 561 } 562 563 return false; 564 } 565 566 @Override 567 public int hashCode() { 568 return name.hashCode(); 569 } 570}