001/* 002 * Copyright (C) 2009 The Guava Authors 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except 005 * in compliance with the License. You may obtain a copy of the License at 006 * 007 * http://www.apache.org/licenses/LICENSE-2.0 008 * 009 * Unless required by applicable law or agreed to in writing, software distributed under the License 010 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 011 * or implied. See the License for the specific language governing permissions and limitations under 012 * the License. 013 */ 014 015package com.google.common.net; 016 017import static com.google.common.base.Preconditions.checkArgument; 018import static com.google.common.base.Preconditions.checkNotNull; 019import static com.google.common.base.Preconditions.checkState; 020 021import com.google.common.annotations.Beta; 022import com.google.common.annotations.GwtCompatible; 023import com.google.common.base.Ascii; 024import com.google.common.base.CharMatcher; 025import com.google.common.base.Joiner; 026import com.google.common.base.Splitter; 027import com.google.common.collect.ImmutableList; 028import com.google.thirdparty.publicsuffix.PublicSuffixPatterns; 029import java.util.List; 030import javax.annotation.Nullable; 031 032/** 033 * An immutable well-formed internet domain name, such as {@code com} or {@code 034 * foo.co.uk}. Only syntactic analysis is performed; no DNS lookups or other network interactions 035 * take place. Thus there is no guarantee that the domain actually exists on the internet. 036 * 037 * <p>One common use of this class is to determine whether a given string is likely to represent an 038 * addressable domain on the web -- that is, for a candidate string {@code "xxx"}, might browsing to 039 * {@code "http://xxx/"} result in a webpage being displayed? In the past, this test was frequently 040 * done by determining whether the domain ended with a {@linkplain #isPublicSuffix() public suffix} 041 * but was not itself a public suffix. However, this test is no longer accurate. There are many 042 * domains which are both public suffixes and addressable as hosts; {@code "uk.com"} is one example. 043 * As a result, the only useful test to determine if a domain is a plausible web host is 044 * {@link #hasPublicSuffix()}. This will return {@code true} for many domains which (currently) are 045 * not hosts, such as {@code "com"}, but given that any public suffix may become a host without 046 * warning, it is better to err on the side of permissiveness and thus avoid spurious rejection of 047 * valid sites. 048 * 049 * <p>During construction, names are normalized in two ways: 050 * 051 * <ol> 052 * <li>ASCII uppercase characters are converted to lowercase. 053 * <li>Unicode dot separators other than the ASCII period ({@code '.'}) are converted to the ASCII 054 * period. 055 * </ol> 056 * 057 * <p>The normalized values will be returned from {@link #toString()} and {@link #parts()}, and will 058 * be reflected in the result of {@link #equals(Object)}. 059 * 060 * <p><a href="http://en.wikipedia.org/wiki/Internationalized_domain_name">Internationalized domain 061 * names</a> such as {@code 网络.cn} are supported, as are the equivalent 062 * <a href="http://en.wikipedia.org/wiki/Internationalized_domain_name">IDNA Punycode-encoded</a> 063 * versions. 064 * 065 * @author Craig Berry 066 * @since 5.0 067 */ 068@Beta 069@GwtCompatible 070public final class InternetDomainName { 071 072 private static final CharMatcher DOTS_MATCHER = CharMatcher.anyOf(".\u3002\uFF0E\uFF61"); 073 private static final Splitter DOT_SPLITTER = Splitter.on('.'); 074 private static final Joiner DOT_JOINER = Joiner.on('.'); 075 076 /** 077 * Value of {@link #publicSuffixIndex} which indicates that no public suffix was found. 078 */ 079 private static final int NO_PUBLIC_SUFFIX_FOUND = -1; 080 081 private static final String DOT_REGEX = "\\."; 082 083 /** 084 * Maximum parts (labels) in a domain name. This value arises from the 255-octet limit described 085 * in <a href="http://www.ietf.org/rfc/rfc2181.txt">RFC 2181</a> part 11 with the fact that the 086 * encoding of each part occupies at least two bytes (dot plus label externally, length byte plus 087 * label internally). Thus, if all labels have the minimum size of one byte, 127 of them will fit. 088 */ 089 private static final int MAX_PARTS = 127; 090 091 /** 092 * Maximum length of a full domain name, including separators, and leaving room for the root 093 * label. See <a href="http://www.ietf.org/rfc/rfc2181.txt">RFC 2181</a> part 11. 094 */ 095 private static final int MAX_LENGTH = 253; 096 097 /** 098 * Maximum size of a single part of a domain name. See 099 * <a href="http://www.ietf.org/rfc/rfc2181.txt">RFC 2181</a> part 11. 100 */ 101 private static final int MAX_DOMAIN_PART_LENGTH = 63; 102 103 /** 104 * The full domain name, converted to lower case. 105 */ 106 private final String name; 107 108 /** 109 * The parts of the domain name, converted to lower case. 110 */ 111 private final ImmutableList<String> parts; 112 113 /** 114 * The index in the {@link #parts()} list at which the public suffix begins. For example, for the 115 * domain name {@code www.google.co.uk}, the value would be 2 (the index of the {@code co} part). 116 * The value is negative (specifically, {@link #NO_PUBLIC_SUFFIX_FOUND}) if no public suffix was 117 * found. 118 */ 119 private final int publicSuffixIndex; 120 121 /** 122 * Constructor used to implement {@link #from(String)}, and from subclasses. 123 */ 124 InternetDomainName(String name) { 125 // Normalize: 126 // * ASCII characters to lowercase 127 // * All dot-like characters to '.' 128 // * Strip trailing '.' 129 130 name = Ascii.toLowerCase(DOTS_MATCHER.replaceFrom(name, '.')); 131 132 if (name.endsWith(".")) { 133 name = name.substring(0, name.length() - 1); 134 } 135 136 checkArgument(name.length() <= MAX_LENGTH, "Domain name too long: '%s':", name); 137 this.name = name; 138 139 this.parts = ImmutableList.copyOf(DOT_SPLITTER.split(name)); 140 checkArgument(parts.size() <= MAX_PARTS, "Domain has too many parts: '%s'", name); 141 checkArgument(validateSyntax(parts), "Not a valid domain name: '%s'", name); 142 143 this.publicSuffixIndex = findPublicSuffix(); 144 } 145 146 /** 147 * Returns the index of the leftmost part of the public suffix, or -1 if not found. Note that the 148 * value defined as the "public suffix" may not be a public suffix according to 149 * {@link #isPublicSuffix()} if the domain ends with an excluded domain pattern such as 150 * {@code "nhs.uk"}. 151 */ 152 private int findPublicSuffix() { 153 final int partsSize = parts.size(); 154 155 for (int i = 0; i < partsSize; i++) { 156 String ancestorName = DOT_JOINER.join(parts.subList(i, partsSize)); 157 158 if (PublicSuffixPatterns.EXACT.containsKey(ancestorName)) { 159 return i; 160 } 161 162 // Excluded domains (e.g. !nhs.uk) use the next highest 163 // domain as the effective public suffix (e.g. uk). 164 165 if (PublicSuffixPatterns.EXCLUDED.containsKey(ancestorName)) { 166 return i + 1; 167 } 168 169 if (matchesWildcardPublicSuffix(ancestorName)) { 170 return i; 171 } 172 } 173 174 return NO_PUBLIC_SUFFIX_FOUND; 175 } 176 177 /** 178 * Returns an instance of {@link InternetDomainName} after lenient validation. Specifically, 179 * validation against <a href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a> 180 * ("Internationalizing Domain Names in Applications") is skipped, while validation against 181 * <a href="http://www.ietf.org/rfc/rfc1035.txt">RFC 1035</a> is relaxed in the following ways: 182 * <ul> 183 * <li>Any part containing non-ASCII characters is considered valid. 184 * <li>Underscores ('_') are permitted wherever dashes ('-') are permitted. 185 * <li>Parts other than the final part may start with a digit, as mandated by 186 * <a href="https://tools.ietf.org/html/rfc1123#section-2">RFC 1123</a>. 187 * </ul> 188 * 189 * 190 * @param domain A domain name (not IP address) 191 * @throws IllegalArgumentException if {@code name} is not syntactically valid according to 192 * {@link #isValid} 193 * @since 10.0 (previously named {@code fromLenient}) 194 */ 195 public static InternetDomainName from(String domain) { 196 return new InternetDomainName(checkNotNull(domain)); 197 } 198 199 /** 200 * Validation method used by {@from} to ensure that the domain name is syntactically valid 201 * according to RFC 1035. 202 * 203 * @return Is the domain name syntactically valid? 204 */ 205 private static boolean validateSyntax(List<String> parts) { 206 final int lastIndex = parts.size() - 1; 207 208 // Validate the last part specially, as it has different syntax rules. 209 210 if (!validatePart(parts.get(lastIndex), true)) { 211 return false; 212 } 213 214 for (int i = 0; i < lastIndex; i++) { 215 String part = parts.get(i); 216 if (!validatePart(part, false)) { 217 return false; 218 } 219 } 220 221 return true; 222 } 223 224 private static final CharMatcher DASH_MATCHER = CharMatcher.anyOf("-_"); 225 226 private static final CharMatcher PART_CHAR_MATCHER = 227 CharMatcher.javaLetterOrDigit().or(DASH_MATCHER); 228 229 /** 230 * Helper method for {@link #validateSyntax(List)}. Validates that one part of a domain name is 231 * valid. 232 * 233 * @param part The domain name part to be validated 234 * @param isFinalPart Is this the final (rightmost) domain part? 235 * @return Whether the part is valid 236 */ 237 private static boolean validatePart(String part, boolean isFinalPart) { 238 239 // These tests could be collapsed into one big boolean expression, but 240 // they have been left as independent tests for clarity. 241 242 if (part.length() < 1 || part.length() > MAX_DOMAIN_PART_LENGTH) { 243 return false; 244 } 245 246 /* 247 * GWT claims to support java.lang.Character's char-classification methods, but it actually only 248 * works for ASCII. So for now, assume any non-ASCII characters are valid. The only place this 249 * seems to be documented is here: 250 * http://osdir.com/ml/GoogleWebToolkitContributors/2010-03/msg00178.html 251 * 252 * <p>ASCII characters in the part are expected to be valid per RFC 1035, with underscore also 253 * being allowed due to widespread practice. 254 */ 255 256 String asciiChars = CharMatcher.ascii().retainFrom(part); 257 258 if (!PART_CHAR_MATCHER.matchesAllOf(asciiChars)) { 259 return false; 260 } 261 262 // No initial or final dashes or underscores. 263 264 if (DASH_MATCHER.matches(part.charAt(0)) 265 || DASH_MATCHER.matches(part.charAt(part.length() - 1))) { 266 return false; 267 } 268 269 /* 270 * Note that we allow (in contravention of a strict interpretation of the relevant RFCs) domain 271 * parts other than the last may begin with a digit (for example, "3com.com"). It's important to 272 * disallow an initial digit in the last part; it's the only thing that stops an IPv4 numeric 273 * address like 127.0.0.1 from looking like a valid domain name. 274 */ 275 276 if (isFinalPart && CharMatcher.digit().matches(part.charAt(0))) { 277 return false; 278 } 279 280 return true; 281 } 282 283 /** 284 * Returns the individual components of this domain name, normalized to all lower case. For 285 * example, for the domain name {@code mail.google.com}, this method returns the list 286 * {@code ["mail", "google", "com"]}. 287 */ 288 public ImmutableList<String> parts() { 289 return parts; 290 } 291 292 /** 293 * Indicates whether this domain name represents a <i>public suffix</i>, as defined by the Mozilla 294 * Foundation's <a href="http://publicsuffix.org/">Public Suffix List</a> (PSL). A public suffix 295 * is one under which Internet users can directly register names, such as {@code com}, 296 * {@code co.uk} or {@code pvt.k12.wy.us}. Examples of domain names that are <i>not</i> public 297 * suffixes include {@code google}, {@code google.com} and {@code foo.co.uk}. 298 * 299 * @return {@code true} if this domain name appears exactly on the public suffix list 300 * @since 6.0 301 */ 302 public boolean isPublicSuffix() { 303 return publicSuffixIndex == 0; 304 } 305 306 /** 307 * Indicates whether this domain name ends in a {@linkplain #isPublicSuffix() public suffix}, 308 * including if it is a public suffix itself. For example, returns {@code true} for 309 * {@code www.google.com}, {@code foo.co.uk} and {@code com}, but not for {@code google} or 310 * {@code google.foo}. This is the recommended method for determining whether a domain is 311 * potentially an addressable host. 312 * 313 * @since 6.0 314 */ 315 public boolean hasPublicSuffix() { 316 return publicSuffixIndex != NO_PUBLIC_SUFFIX_FOUND; 317 } 318 319 /** 320 * Returns the {@linkplain #isPublicSuffix() public suffix} portion of the domain name, or 321 * {@code null} if no public suffix is present. 322 * 323 * @since 6.0 324 */ 325 public InternetDomainName publicSuffix() { 326 return hasPublicSuffix() ? ancestor(publicSuffixIndex) : null; 327 } 328 329 /** 330 * Indicates whether this domain name ends in a {@linkplain #isPublicSuffix() public suffix}, 331 * while not being a public suffix itself. For example, returns {@code true} for 332 * {@code www.google.com}, {@code foo.co.uk} and {@code bar.ca.us}, but not for {@code google}, 333 * {@code com}, or {@code 334 * google.foo}. 335 * 336 * <p><b>Warning:</b> a {@code false} result from this method does not imply that the domain does 337 * not represent an addressable host, as many public suffixes are also addressable hosts. Use 338 * {@link #hasPublicSuffix()} for that test. 339 * 340 * <p>This method can be used to determine whether it will probably be possible to set cookies on 341 * the domain, though even that depends on individual browsers' implementations of cookie 342 * controls. See <a href="http://www.ietf.org/rfc/rfc2109.txt">RFC 2109</a> for details. 343 * 344 * @since 6.0 345 */ 346 public boolean isUnderPublicSuffix() { 347 return publicSuffixIndex > 0; 348 } 349 350 /** 351 * Indicates whether this domain name is composed of exactly one subdomain component followed by a 352 * {@linkplain #isPublicSuffix() public suffix}. For example, returns {@code true} for 353 * {@code google.com} and {@code foo.co.uk}, but not for {@code www.google.com} or {@code co.uk}. 354 * 355 * <p><b>Warning:</b> A {@code true} result from this method does not imply that the domain is at 356 * the highest level which is addressable as a host, as many public suffixes are also addressable 357 * hosts. For example, the domain {@code bar.uk.com} has a public suffix of {@code uk.com}, so it 358 * would return {@code true} from this method. But {@code uk.com} is itself an addressable host. 359 * 360 * <p>This method can be used to determine whether a domain is probably the highest level for 361 * which cookies may be set, though even that depends on individual browsers' implementations of 362 * cookie controls. See <a href="http://www.ietf.org/rfc/rfc2109.txt">RFC 2109</a> for details. 363 * 364 * @since 6.0 365 */ 366 public boolean isTopPrivateDomain() { 367 return publicSuffixIndex == 1; 368 } 369 370 /** 371 * Returns the portion of this domain name that is one level beneath the public suffix. For 372 * example, for {@code x.adwords.google.co.uk} it returns {@code google.co.uk}, since 373 * {@code co.uk} is a public suffix. 374 * 375 * <p>If {@link #isTopPrivateDomain()} is true, the current domain name instance is returned. 376 * 377 * <p>This method should not be used to determine the topmost parent domain which is addressable 378 * as a host, as many public suffixes are also addressable hosts. For example, the domain 379 * {@code foo.bar.uk.com} has a public suffix of {@code uk.com}, so it would return 380 * {@code bar.uk.com} from this method. But {@code uk.com} is itself an addressable host. 381 * 382 * <p>This method can be used to determine the probable highest level parent domain for which 383 * cookies may be set, though even that depends on individual browsers' implementations of cookie 384 * controls. 385 * 386 * @throws IllegalStateException if this domain does not end with a public suffix 387 * @since 6.0 388 */ 389 public InternetDomainName topPrivateDomain() { 390 if (isTopPrivateDomain()) { 391 return this; 392 } 393 checkState(isUnderPublicSuffix(), "Not under a public suffix: %s", name); 394 return ancestor(publicSuffixIndex - 1); 395 } 396 397 /** 398 * Indicates whether this domain is composed of two or more parts. 399 */ 400 public boolean hasParent() { 401 return parts.size() > 1; 402 } 403 404 /** 405 * Returns an {@code InternetDomainName} that is the immediate ancestor of this one; that is, the 406 * current domain with the leftmost part removed. For example, the parent of 407 * {@code www.google.com} is {@code google.com}. 408 * 409 * @throws IllegalStateException if the domain has no parent, as determined by {@link #hasParent} 410 */ 411 public InternetDomainName parent() { 412 checkState(hasParent(), "Domain '%s' has no parent", name); 413 return ancestor(1); 414 } 415 416 /** 417 * Returns the ancestor of the current domain at the given number of levels "higher" (rightward) 418 * in the subdomain list. The number of levels must be non-negative, and less than {@code N-1}, 419 * where {@code N} is the number of parts in the domain. 420 * 421 * <p>TODO: Reasonable candidate for addition to public API. 422 */ 423 private InternetDomainName ancestor(int levels) { 424 return from(DOT_JOINER.join(parts.subList(levels, parts.size()))); 425 } 426 427 /** 428 * Creates and returns a new {@code InternetDomainName} by prepending the argument and a dot to 429 * the current name. For example, {@code 430 * InternetDomainName.from("foo.com").child("www.bar")} returns a new {@code InternetDomainName} 431 * with the value {@code www.bar.foo.com}. Only lenient validation is performed, as described 432 * {@link #from(String) here}. 433 * 434 * @throws NullPointerException if leftParts is null 435 * @throws IllegalArgumentException if the resulting name is not valid 436 */ 437 public InternetDomainName child(String leftParts) { 438 return from(checkNotNull(leftParts) + "." + name); 439 } 440 441 /** 442 * Indicates whether the argument is a syntactically valid domain name using lenient validation. 443 * Specifically, validation against <a href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a> 444 * ("Internationalizing Domain Names in Applications") is skipped. 445 * 446 * <p>The following two code snippets are equivalent: 447 * 448 * <pre> {@code 449 * domainName = InternetDomainName.isValid(name) 450 * ? InternetDomainName.from(name) 451 * : DEFAULT_DOMAIN;}</pre> 452 * 453 * <pre> {@code 454 * try { 455 * domainName = InternetDomainName.from(name); 456 * } catch (IllegalArgumentException e) { 457 * domainName = DEFAULT_DOMAIN; 458 * }}</pre> 459 * 460 * @since 8.0 (previously named {@code isValidLenient}) 461 */ 462 public static boolean isValid(String name) { 463 try { 464 from(name); 465 return true; 466 } catch (IllegalArgumentException e) { 467 return false; 468 } 469 } 470 471 /** 472 * Does the domain name match one of the "wildcard" patterns (e.g. {@code "*.ar"})? 473 */ 474 private static boolean matchesWildcardPublicSuffix(String domain) { 475 final String[] pieces = domain.split(DOT_REGEX, 2); 476 return pieces.length == 2 && PublicSuffixPatterns.UNDER.containsKey(pieces[1]); 477 } 478 479 /** 480 * Returns the domain name, normalized to all lower case. 481 */ 482 @Override 483 public String toString() { 484 return name; 485 } 486 487 /** 488 * Equality testing is based on the text supplied by the caller, after normalization as described 489 * in the class documentation. For example, a non-ASCII Unicode domain name and the Punycode 490 * version of the same domain name would not be considered equal. 491 * 492 */ 493 @Override 494 public boolean equals(@Nullable Object object) { 495 if (object == this) { 496 return true; 497 } 498 499 if (object instanceof InternetDomainName) { 500 InternetDomainName that = (InternetDomainName) object; 501 return this.name.equals(that.name); 502 } 503 504 return false; 505 } 506 507 @Override 508 public int hashCode() { 509 return name.hashCode(); 510 } 511}