/*
 * Copyright (C) 2009 Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.google.common.net;

import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkNotNull;
import static com.google.common.base.Preconditions.checkState;

import com.google.common.annotations.Beta;
import com.google.common.annotations.GwtCompatible;
import com.google.common.base.Ascii;
import com.google.common.base.CharMatcher;
import com.google.common.base.Joiner;
import com.google.common.base.Objects;
import com.google.common.base.Splitter;
import com.google.common.collect.ImmutableList;

import java.util.List;

import javax.annotation.Nullable;

/**
 * An immutable well-formed internet domain name, as defined by
 * <a href="http://www.ietf.org/rfc/rfc1035.txt">RFC 1035</a>.
 * Examples include {@code com} and {@code foo.co.uk}. Only syntactic analysis
 * is performed; no DNS lookups or other network interactions take place. Thus
 * there is no guarantee that the domain actually exists on the internet.
 * Invalid domain names throw {@link IllegalArgumentException} on construction.
 *
 * <p>One common use of this class is to determine whether a given string is
 * likely to represent an addressable domain on the web -- that is, for a
 * candidate string "xxx", might browsing to "http://xxx/" result in a webpage
 * being displayed? In the past, this test was frequently done by determining
 * whether the domain ended with a {@linkplain #isPublicSuffix() public suffix}
 * but was not itself a public suffix. However, this test is no longer accurate;
 * there are many domains which are both public suffixes and addressable as
 * hosts. "uk.com" is one example. As a result, the only useful test to
 * determine if a domain is a plausible web host is {@link #hasPublicSuffix()}.
 * This will return {@code true} for many domains which (currently) are not
 * hosts (such as "com"), but given that any public suffix may become
 * a host without warning, it is better to err on the side of permissiveness
 * and thus avoid spurious rejection of valid sites.
 *
 * <p>{@linkplain #equals(Object) Equality} of domain names is case-insensitive
 * with respect to ASCII characters, so for convenience, the {@link #name()} and
 * {@link #parts()} methods return strings with all ASCII characters converted
 * to lowercase.
 *
 * <p><a href="http://en.wikipedia.org/wiki/Internationalized_domain_name">
 * internationalized domain names</a> such as {@code 网络.cn} are
 * supported, but with much weaker syntactic validation (resulting in false
 * positive reports of validity).
 *
 * @author Craig Berry
 * @since 5
 */
@Beta
@GwtCompatible(emulated = true)
public final class InternetDomainName {

  /**
   * Matches the ASCII full stop and the other dot-like code points that the
   * constructor normalizes to {@code '.'} before splitting.
   */
  private static final CharMatcher DOTS_MATCHER =
      CharMatcher.anyOf(".\u3002\uFF0E\uFF61");
  private static final Splitter DOT_SPLITTER = Splitter.on('.');
  private static final Joiner DOT_JOINER = Joiner.on('.');

  /**
   * Value of {@link #publicSuffixIndex} which indicates that no public suffix
   * was found.
   */
  private static final int NO_PUBLIC_SUFFIX_FOUND = -1;

  private static final String DOT_REGEX = "\\.";

  /**
   * Maximum length of a full domain name in dotted text form, not counting a
   * trailing dot. RFC 1035 limits a name to 255 octets in wire format, which
   * corresponds to 253 characters of text.
   */
  private static final int MAX_LENGTH = 253;

  /**
   * The full domain name, converted to lower case.
   */
  private final String name;

  /**
   * The parts of the domain name, converted to lower case.
   */
  private final ImmutableList<String> parts;

  /**
   * The index in the {@link #parts()} list at which the public suffix begins.
   * For example, for the domain name {@code www.google.co.uk}, the value would
   * be 2 (the index of the {@code co} part). The value is negative
   * (specifically, {@link #NO_PUBLIC_SUFFIX_FOUND}) if no public suffix was
   * found.
   */
  private final int publicSuffixIndex;

  /**
   * Private constructor used to implement {@link #fromLenient(String)}.
   *
   * @param name the candidate domain name; the caller is expected to have
   *     lower-cased its ASCII characters already
   * @throws IllegalArgumentException if the normalized name is longer than
   *     {@link #MAX_LENGTH} characters or any of its parts fails
   *     {@link #validateSyntax(List)}
   */
  private InternetDomainName(String name) {
    // Normalize all dot-like characters to '.', and strip trailing '.'.

    name = DOTS_MATCHER.replaceFrom(name, '.');

    if (name.endsWith(".")) {
      name = name.substring(0, name.length() - 1);
    }

    // Per-part length is checked in validatePart(); the overall limit has to
    // be enforced here, after the optional trailing dot has been removed.
    // NOTE(review): for non-ASCII names this counts Java chars rather than
    // encoded octets, in keeping with the lenient validation done elsewhere.
    checkArgument(name.length() <= MAX_LENGTH,
        "Domain name too long: '%s'", name);

    this.name = name;
    this.parts = ImmutableList.copyOf(DOT_SPLITTER.split(name));
    checkArgument(validateSyntax(parts), "Not a valid domain name: '%s'", name);
    this.publicSuffixIndex = findPublicSuffix();
  }

  /**
   * Private constructor used to implement {@link #ancestor(int)}. Argument
   * parts are assumed to be valid, as they always come from an existing domain.
   *
   * @param parts the non-empty list of already-validated parts
   * @throws IllegalArgumentException if {@code parts} is empty
   */
  private InternetDomainName(List<String> parts) {
    checkArgument(!parts.isEmpty());

    this.parts = ImmutableList.copyOf(parts);
    this.name = DOT_JOINER.join(parts);
    this.publicSuffixIndex = findPublicSuffix();
  }

  /**
   * Returns the index of the leftmost part of the public suffix, or -1 if not
   * found. Note that the value defined as the "public suffix" may not be a
   * public suffix according to {@link #isPublicSuffix()} if the domain ends
   * with an excluded domain pattern such as "nhs.uk".
   *
   * <p>Candidates are tried from the full name down to the rightmost part, so
   * the longest matching suffix wins.
   */
  private int findPublicSuffix() {
    final int partsSize = parts.size();

    for (int i = 0; i < partsSize; i++) {
      String ancestorName = DOT_JOINER.join(parts.subList(i, partsSize));

      if (TldPatterns.EXACT.contains(ancestorName)) {
        return i;
      }

      // Excluded domains (e.g. !nhs.uk) use the next highest
      // domain as the effective public suffix (e.g. uk).

      if (TldPatterns.EXCLUDED.contains(ancestorName)) {
        return i + 1;
      }

      if (matchesWildcardPublicSuffix(ancestorName)) {
        return i;
      }
    }

    return NO_PUBLIC_SUFFIX_FOUND;
  }

  /**
   * A factory method for creating {@code InternetDomainName} objects. Only
   * lenient validation of the domain is performed. Specifically,
   * validation against
   * <a href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>
   * ("Internationalizing Domain Names in Applications") is not performed.
   *
   * @param domain A domain name (not IP address)
   * @throws NullPointerException if {@code domain} is null
   * @throws IllegalArgumentException if {@code domain} is not syntactically
   *     valid
   * @since 8 (previously named {@code from})
   */
  public static InternetDomainName fromLenient(String domain) {
    /*
     * RFC 1035 defines ASCII components of domain names to be case-insensitive;
     * normalizing ASCII characters to lower case allows us to simplify matching
     * and support more robust equality testing.
     */
    return new InternetDomainName(Ascii.toLowerCase(checkNotNull(domain)));
  }

  /**
   * Validation method used by {@link #fromLenient(String)} to ensure that the
   * domain name is syntactically valid according to RFC 1035.
   *
   * @param parts the dot-separated parts of the candidate name; must contain
   *     at least one element
   * @return Is the domain name syntactically valid?
   */
  private static boolean validateSyntax(List<String> parts) {
    final int lastIndex = parts.size() - 1;

    // Validate the last part specially, as it has different syntax rules.

    if (!validatePart(parts.get(lastIndex), true)) {
      return false;
    }

    for (int i = 0; i < lastIndex; i++) {
      String part = parts.get(i);
      if (!validatePart(part, false)) {
        return false;
      }
    }

    return true;
  }

  /**
   * The maximum size of a single part of a domain name.
   */
  private static final int MAX_DOMAIN_PART_LENGTH = 63;

  /** Characters allowed inside a part but not at either end of it. */
  private static final CharMatcher DASH_MATCHER = CharMatcher.anyOf("-_");

  private static final CharMatcher PART_CHAR_MATCHER =
      CharMatcher.JAVA_LETTER_OR_DIGIT.or(DASH_MATCHER);

  /**
   * Helper method for {@link #validateSyntax(List)}. Validates that one part of
   * a domain name is valid.
   *
   * @param part The domain name part to be validated
   * @param isFinalPart Is this the final (rightmost) domain part?
   * @return Whether the part is valid
   */
  private static boolean validatePart(String part, boolean isFinalPart) {

    // These tests could be collapsed into one big boolean expression, but
    // they have been left as independent tests for clarity.

    if (part.length() < 1 || part.length() > MAX_DOMAIN_PART_LENGTH) {
      return false;
    }

    // GWT claims to support java.lang.Character's char-classification
    // methods, but it actually only works for ASCII. So for now,
    // assume anything with non-ASCII characters is valid.
    // The only place this seems to be documented is here:
    // http://osdir.com/ml/GoogleWebToolkitContributors/2010-03/msg00178.html

    if (!CharMatcher.ASCII.matchesAllOf(part)) {
      return true;
    }

    if (!PART_CHAR_MATCHER.matchesAllOf(part)) {
      return false;
    }

    // A part may neither start nor end with '-' or '_'.
    if (DASH_MATCHER.matches(part.charAt(0))
        || DASH_MATCHER.matches(part.charAt(part.length() - 1))) {
      return false;
    }

    // The rightmost part may not begin with a digit.
    if (isFinalPart && CharMatcher.DIGIT.matches(part.charAt(0))) {
      return false;
    }

    return true;
  }

  /**
   * Returns the domain name, normalized to all lower case.
   */
  public String name() {
    return name;
  }

  /**
   * Returns the individual components of this domain name, normalized to all
   * lower case. For example, for the domain name {@code mail.google.com}, this
   * method returns the list {@code ["mail", "google", "com"]}.
   */
  public ImmutableList<String> parts() {
    return parts;
  }

  /**
   * Indicates whether this domain name represents a <i>public suffix</i>, as
   * defined by the Mozilla Foundation's
   * <a href="http://publicsuffix.org/">Public Suffix List</a> (PSL). A public
   * suffix is one under which Internet users can directly register names, such
   * as {@code com}, {@code co.uk} or {@code pvt.k12.wy.us}. Examples of domain
   * names that are <i>not</i> public suffixes include {@code google}, {@code
   * google.com} and {@code foo.co.uk}.
   *
   * @return {@code true} if this domain name appears exactly on the public
   *         suffix list
   * @since 6
   */
  public boolean isPublicSuffix() {
    return publicSuffixIndex == 0;
  }

  /**
   * Indicates whether this domain name ends in a {@linkplain #isPublicSuffix()
   * public suffix}, including if it is a public suffix itself. For example,
   * returns {@code true} for {@code www.google.com}, {@code foo.co.uk} and
   * {@code com}, but not for {@code google} or {@code google.foo}. This is
   * the recommended method for determining whether a domain is potentially an
   * addressable host.
   *
   * @since 6
   */
  public boolean hasPublicSuffix() {
    return publicSuffixIndex != NO_PUBLIC_SUFFIX_FOUND;
  }

  /**
   * Returns the {@linkplain #isPublicSuffix() public suffix} portion of the
   * domain name, or {@code null} if no public suffix is present.
   *
   * @since 6
   */
  public InternetDomainName publicSuffix() {
    return hasPublicSuffix() ? ancestor(publicSuffixIndex) : null;
  }

  /**
   * Indicates whether this domain name ends in a {@linkplain #isPublicSuffix()
   * public suffix}, while not being a public suffix itself. For example,
   * returns {@code true} for {@code www.google.com}, {@code foo.co.uk} and
   * {@code bar.ca.us}, but not for {@code google}, {@code com}, or {@code
   * google.foo}.
   *
   * <p><b>Warning:</b> a {@code false} result from this method does not imply
   * that the domain does not represent an addressable host, as many public
   * suffixes are also addressable hosts. Use {@link #hasPublicSuffix()} for
   * that test.
   *
   * <p>This method can be used to determine whether it will probably be
   * possible to set cookies on the domain, though even that depends on
   * individual browsers' implementations of cookie controls. See
   * <a href="http://www.ietf.org/rfc/rfc2109.txt">RFC 2109</a> for details.
   *
   * @since 6
   */
  public boolean isUnderPublicSuffix() {
    return publicSuffixIndex > 0;
  }

  /**
   * Indicates whether this domain name is composed of exactly one subdomain
   * component followed by a {@linkplain #isPublicSuffix() public suffix}. For
   * example, returns {@code true} for {@code google.com} and {@code foo.co.uk},
   * but not for {@code www.google.com} or {@code co.uk}.
   *
   * <p><b>Warning:</b> A {@code true} result from this method does not imply
   * that the domain is at the highest level which is addressable as a host, as
   * many public suffixes are also addressable hosts. For example, the domain
   * {@code bar.uk.com} has a public suffix of {@code uk.com}, so it would
   * return {@code true} from this method. But {@code uk.com} is itself an
   * addressable host.
   *
   * <p>This method can be used to determine whether a domain is probably the
   * highest level for which cookies may be set, though even that depends on
   * individual browsers' implementations of cookie controls. See
   * <a href="http://www.ietf.org/rfc/rfc2109.txt">RFC 2109</a> for details.
   *
   * @since 6
   */
  public boolean isTopPrivateDomain() {
    return publicSuffixIndex == 1;
  }

  /**
   * Returns the portion of this domain name that is one level beneath the
   * public suffix. For example, for {@code x.adwords.google.co.uk} it returns
   * {@code google.co.uk}, since {@code co.uk} is a public suffix.
   *
   * <p>If {@link #isTopPrivateDomain()} is true, the current domain name
   * instance is returned.
   *
   * <p>This method should not be used to determine the topmost parent domain
   * which is addressable as a host, as many public suffixes are also
   * addressable hosts. For example, the domain {@code foo.bar.uk.com} has
   * a public suffix of {@code uk.com}, so it would return {@code bar.uk.com}
   * from this method. But {@code uk.com} is itself an addressable host.
   *
   * <p>This method can be used to determine the probable highest level parent
   * domain for which cookies may be set, though even that depends on individual
   * browsers' implementations of cookie controls.
   *
   * @throws IllegalStateException if this domain is not
   *     {@linkplain #isUnderPublicSuffix() under a public suffix}; that is, if
   *     it has no public suffix or is itself a public suffix
   * @since 6
   */
  public InternetDomainName topPrivateDomain() {
    if (isTopPrivateDomain()) {
      return this;
    }
    checkState(isUnderPublicSuffix(), "Not under a public suffix: %s", name);
    return ancestor(publicSuffixIndex - 1);
  }

  /**
   * Indicates whether this domain is composed of two or more parts.
   */
  public boolean hasParent() {
    return parts.size() > 1;
  }

  /**
   * Returns an {@code InternetDomainName} that is the immediate ancestor of
   * this one; that is, the current domain with the leftmost part removed. For
   * example, the parent of {@code www.google.com} is {@code google.com}.
   *
   * @throws IllegalStateException if the domain has no parent, as determined
   *     by {@link #hasParent}
   */
  public InternetDomainName parent() {
    checkState(hasParent(), "Domain '%s' has no parent", name);
    return ancestor(1);
  }

  /**
   * Returns the ancestor of the current domain at the given number of levels
   * "higher" (rightward) in the subdomain list. The number of levels must be
   * non-negative, and less than {@code N}, where {@code N} is the number of
   * parts in the domain, so that at least one part remains.
   *
   * <p>TODO: Reasonable candidate for addition to public API.
   */
  private InternetDomainName ancestor(int levels) {
    return new InternetDomainName(parts.subList(levels, parts.size()));
  }

  /**
   * Creates and returns a new {@code InternetDomainName} by prepending the
   * argument and a dot to the current name. For example, {@code
   * InternetDomainName.fromLenient("foo.com").child("www.bar")} returns a new
   * {@code InternetDomainName} with the value {@code www.bar.foo.com}.
   *
   * @throws NullPointerException if leftParts is null
   * @throws IllegalArgumentException if the resulting name is not valid
   */
  public InternetDomainName child(String leftParts) {
    return InternetDomainName.fromLenient(checkNotNull(leftParts) + "." + name);
  }

  /**
   * Indicates whether the argument is a syntactically valid domain name. Only
   * lenient validation is done, as described in {@link #fromLenient(String)}.
   *
   * <p>This method is intended for the case where a {@link String} must be
   * validated as a valid domain name, but no further work with that
   * {@link String} as an {@link InternetDomainName} will be required. Code like
   * the following will unnecessarily repeat the work of validation:
   * <pre>   {@code
   *
   *   if (InternetDomainName.isValidLenient(name)) {
   *     domainName = InternetDomainName.fromLenient(name);
   *   } else {
   *     domainName = DEFAULT_DOMAIN;
   *   }}</pre>
   *
   * Such code could instead be written as follows: <pre>   {@code
   *
   *   try {
   *     domainName = InternetDomainName.fromLenient(name);
   *   } catch (IllegalArgumentException e) {
   *     domainName = DEFAULT_DOMAIN;
   *   }}</pre>
   *
   * <p>Only {@link IllegalArgumentException} is translated into a {@code
   * false} result; any other exception raised by {@link #fromLenient(String)}
   * (for example for a null argument) propagates to the caller.
   *
   * @since 8 (previously named {@code isValid})
   */
  public static boolean isValidLenient(String name) {
    try {
      fromLenient(name);
      return true;
    } catch (IllegalArgumentException e) {
      return false;
    }
  }

  /**
   * Does the domain name match one of the "wildcard" patterns (e.g. "*.ar")?
   * The name matches when everything after its first dot is listed in
   * {@code TldPatterns.UNDER}.
   */
  private static boolean matchesWildcardPublicSuffix(String domain) {
    // Split into the leftmost part and the remainder (at most two pieces).
    final String[] pieces = domain.split(DOT_REGEX, 2);
    return pieces.length == 2 && TldPatterns.UNDER.contains(pieces[1]);
  }

  // TODO: specify this to return the same as name(); remove name()
  @Override
  public String toString() {
    return Objects.toStringHelper(this).add("name", name).toString();
  }

  /**
   * Two domain names are equal when their normalized (lower-cased) names are
   * equal.
   */
  @Override
  public boolean equals(@Nullable Object object) {
    if (object == this) {
      return true;
    }

    if (object instanceof InternetDomainName) {
      InternetDomainName that = (InternetDomainName) object;
      return this.name.equals(that.name);
    }

    return false;
  }

  @Override
  public int hashCode() {
    return name.hashCode();
  }

}