/*
 * Copyright (C) 2009 Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.google.common.net;

import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkNotNull;
import static com.google.common.base.Preconditions.checkState;

import com.google.common.annotations.Beta;
import com.google.common.annotations.GwtCompatible;
import com.google.common.base.CharMatcher;
import com.google.common.base.Joiner;
import com.google.common.base.Objects;
import com.google.common.base.Splitter;
import com.google.common.collect.ImmutableList;

import java.util.List;
import java.util.Locale;

import javax.annotation.Nullable;

/**
 * An immutable well-formed internet domain name, as defined by
 * <a href="http://www.ietf.org/rfc/rfc1035.txt">RFC 1035</a>, with the
 * exception that names ending in {@code "."} are not supported (as they are not
 * generally used in browsers, email, and other end-user applications). Examples
 * include {@code com} and {@code foo.co.uk}. Only syntactic analysis is
 * performed; no DNS lookups or other network interactions take place. Thus
 * there is no guarantee that the domain actually exists on the internet.
 * Invalid domain names throw {@link IllegalArgumentException} on construction.
 *
 * <p>It is often the case that domains of interest are those under a
 * {@linkplain #isPublicSuffix() public suffix} but not themselves a public
 * suffix; {@link #isUnderPublicSuffix()} and {@link #isTopPrivateDomain()} test
 * for this. Similarly, one often needs to obtain the domain consisting of the
 * public suffix plus one subdomain level, typically to obtain the highest-level
 * domain for which cookies may be set. Use {@link #topPrivateDomain()} for this
 * purpose.
 *
 * <p>{@linkplain #equals(Object) Equality} of domain names is case-insensitive,
 * so for convenience, the {@link #name()} and {@link #parts()} methods return
 * the lowercase form of the name.
 *
 * <p><a href="http://en.wikipedia.org/wiki/Internationalized_domain_name">
 * Internationalized domain names (IDN)</a> such as {@code 网络.cn} are
 * supported.
 *
 * @author Craig Berry
 * @since 5
 */
@Beta
@GwtCompatible
public final class InternetDomainName {
  private static final Splitter DOT_SPLITTER = Splitter.on('.');
  private static final Joiner DOT_JOINER = Joiner.on('.');

  /**
   * Value of {@link #publicSuffixIndex} which indicates that no public suffix
   * was found.
   */
  private static final int NO_PUBLIC_SUFFIX_FOUND = -1;

  /**
   * The full domain name, converted to lower case.
   */
  private final String name;

  /**
   * The parts of the domain name, converted to lower case.
   */
  private final ImmutableList<String> parts;

  /**
   * The index in the {@link #parts()} list at which the public suffix begins.
   * For example, for the domain name {@code www.google.co.uk}, the value would
   * be 2 (the index of the {@code co} part). The value is negative
   * (specifically, {@link #NO_PUBLIC_SUFFIX_FOUND}) if no public suffix was
   * found.
   */
  private final int publicSuffixIndex;

  /**
   * Private constructor used to implement {@link #from(String)}. The argument
   * is expected to be lower-cased already; it is split on dots and validated
   * here.
   *
   * @throws IllegalArgumentException if the name is not syntactically valid
   */
  private InternetDomainName(String name) {
    this.name = name;
    this.parts = ImmutableList.copyOf(DOT_SPLITTER.split(name));
    checkArgument(validateSyntax(parts), "Not a valid domain name: '%s'", name);
    this.publicSuffixIndex = findPublicSuffix();
  }

  /**
   * Private constructor used to implement {@link #ancestor(int)}. Argument
   * parts are assumed to be valid, as they always come from an existing domain.
   *
   * @throws IllegalArgumentException if {@code parts} is empty
   */
  private InternetDomainName(List<String> parts) {
    checkArgument(!parts.isEmpty());

    this.parts = ImmutableList.copyOf(parts);
    this.name = DOT_JOINER.join(parts);
    this.publicSuffixIndex = findPublicSuffix();
  }

  /**
   * Returns the index of the leftmost part of the public suffix, or -1 if not
   * found. Candidates are tried from the longest (the whole name) to the
   * shortest (the last part alone), so the first match is the longest suffix.
   */
  private int findPublicSuffix() {
    final int partsSize = parts.size();

    for (int i = 0; i < partsSize; i++) {
      String ancestorName = DOT_JOINER.join(parts.subList(i, partsSize));

      if (isPublicSuffixInternal(ancestorName)) {
        return i;
      }
    }

    return NO_PUBLIC_SUFFIX_FOUND;
  }

  /**
   * A factory method for creating {@code InternetDomainName} objects.
   *
   * @param domain A domain name (not IP address)
   * @throws NullPointerException If domain is null
   * @throws IllegalArgumentException If name is not syntactically valid
   */
  public static InternetDomainName from(String domain) {
    // RFC 1035 defines domain names to be case-insensitive; normalizing
    // to lower case allows us to simplify matching.
    //
    // A fixed locale is passed because the no-argument String.toLowerCase()
    // follows the JVM default locale; under a Turkish locale, for example,
    // 'I' maps to dotless i (U+0131), so the normalized name (and therefore
    // equality and public-suffix matching) would depend on where the code
    // runs. Locale.ENGLISH is used rather than Locale.ROOT so that nothing
    // newer than Java 5 is required.
    //
    // NOTE(review): this class is @GwtCompatible; confirm that the GWT JRE
    // emulation used by the build provides String.toLowerCase(Locale).
    return new InternetDomainName(
        checkNotNull(domain).toLowerCase(Locale.ENGLISH));
  }

  // TODO: For the moment, we validate that all parts of a domain
  //    * Start and end with an alphanumeric character
  //    * Have alphanumeric, dash, or underscore characters internally
  // An additional constraint is that the first character of the last part
  // may not be numeric.
  // All of this is a compromise to allow relatively accurate and efficient
  // checking. We may soon move to using java.net.IDN for this purpose in
  // non-GWT code.

  /**
   * Validation method used by {@link #from(String)} to ensure that the domain
   * name is syntactically valid. See the TODO above for the rules that are
   * actually enforced, which are an approximation of RFC 1035.
   *
   * @param parts The dot-separated parts of the name; must not be empty
   * @return Is the domain name syntactically valid?
   */
  private static boolean validateSyntax(List<String> parts) {
    final int lastIndex = parts.size() - 1;

    // Validate the last part specially, as it has different syntax rules.

    if (!validatePart(parts.get(lastIndex), true)) {
      return false;
    }

    for (int i = 0; i < lastIndex; i++) {
      String part = parts.get(i);
      if (!validatePart(part, false)) {
        return false;
      }
    }

    return true;
  }

  /**
   * The maximum size of a single part of a domain name.
   */
  private static final int MAX_DOMAIN_PART_LENGTH = 63;

  private static final CharMatcher DASH_MATCHER = CharMatcher.anyOf("-_");

  private static final CharMatcher PART_CHAR_MATCHER =
      CharMatcher.JAVA_LETTER_OR_DIGIT.or(DASH_MATCHER);

  /**
   * Helper method for {@link #validateSyntax(List)}. Validates that one part of
   * a domain name is valid.
   *
   * @param part The domain name part to be validated
   * @param isFinalPart Is this the final (rightmost) domain part?
   * @return Whether the part is valid
   */
  private static boolean validatePart(String part, boolean isFinalPart) {

    // These tests could be collapsed into one big boolean expression, but
    // they have been left as independent tests for clarity.

    if (part.length() < 1 || part.length() > MAX_DOMAIN_PART_LENGTH) {
      return false;
    }

    // GWT claims to support java.lang.Character's char-classification
    // methods, but it actually only works for ASCII. So for now,
    // assume anything with non-ASCII characters is valid.
    // The only place this seems to be documented is here:
    // http://osdir.com/ml/GoogleWebToolkitContributors/2010-03/msg00178.html

    if (!CharMatcher.ASCII.matchesAllOf(part)) {
      return true;
    }

    if (!PART_CHAR_MATCHER.matchesAllOf(part)) {
      return false;
    }

    // A dash or underscore is allowed only in the interior of a part.
    if (DASH_MATCHER.matches(part.charAt(0))
        || DASH_MATCHER.matches(part.charAt(part.length() - 1))) {
      return false;
    }

    // The rightmost part may not start with a digit.
    if (isFinalPart && CharMatcher.DIGIT.matches(part.charAt(0))) {
      return false;
    }

    return true;
  }

  /**
   * Returns the domain name, normalized to all lower case.
   */
  public String name() {
    return name;
  }

  /**
   * Returns the individual components of this domain name, normalized to all
   * lower case. For example, for the domain name {@code mail.google.com}, this
   * method returns the list {@code ["mail", "google", "com"]}.
   */
  public ImmutableList<String> parts() {
    return parts;
  }

  /**
   * Old location of {@link #isPublicSuffix()}.
   *
   * @deprecated use {@link #isPublicSuffix()}
   */
  @Deprecated public boolean isRecognizedTld() {
    return isPublicSuffix();
  }

  /**
   * Old location of {@link #isUnderPublicSuffix()}.
   *
   * @deprecated use {@link #isUnderPublicSuffix()}
   */
  @Deprecated public boolean isUnderRecognizedTld() {
    return isUnderPublicSuffix();
  }

  /**
   * Old location of {@link #hasPublicSuffix()}.
   *
   * @deprecated use {@link #hasPublicSuffix()}
   */
  @Deprecated public boolean hasRecognizedTld() {
    return hasPublicSuffix();
  }

  /**
   * Old location of {@link #publicSuffix()}.
   *
   * @deprecated use {@link #publicSuffix()}
   */
  @Deprecated public InternetDomainName recognizedTld() {
    return publicSuffix();
  }

  /**
   * Old location of {@link #isTopPrivateDomain()}.
   *
   * @deprecated use {@link #isTopPrivateDomain()}
   */
  @Deprecated public boolean isImmediatelyUnderTld() {
    return isTopPrivateDomain();
  }

  /**
   * Old location of {@link #topPrivateDomain()}.
   *
   * @deprecated use {@link #topPrivateDomain()}
   */
  @Deprecated public InternetDomainName topCookieDomain() {
    return topPrivateDomain();
  }

  /**
   * Returns the rightmost non-{@linkplain #isRecognizedTld() TLD} domain name
   * part. For example
   * {@code InternetDomainName.from("www.google.com").rightmostNonTldPart()}
   * returns {@code "google"}. Returns null if either no
   * {@linkplain #isRecognizedTld() TLD} is found, or the whole domain name is
   * itself a {@linkplain #isRecognizedTld() TLD}.
   *
   * @deprecated use the first {@linkplain #parts part} of the {@link
   *     #topPrivateDomain()}
   */
  @Deprecated public String rightmostNonTldPart() {
    return publicSuffixIndex >= 1
        ? parts.get(publicSuffixIndex - 1)
        : null;
  }

  /**
   * Indicates whether this domain name represents a <i>public suffix</i>, as
   * defined by the Mozilla Foundation's
   * <a href="http://publicsuffix.org/">Public Suffix List</a> (PSL). A public
   * suffix is one under which Internet users can directly register names, such
   * as {@code com}, {@code co.uk} or {@code pvt.k12.wy.us}. Examples of domain
   * names that are <i>not</i> public suffixes include {@code google}, {@code
   * google.com} and {@code foo.co.uk}.
   *
   * @return {@code true} if this domain name appears exactly on the public
   *     suffix list
   * @since 6
   */
  public boolean isPublicSuffix() {
    return publicSuffixIndex == 0;
  }

  /**
   * Indicates whether this domain name ends in a {@linkplain #isPublicSuffix()
   * public suffix}, including if it is a public suffix itself. For example,
   * returns {@code true} for {@code www.google.com}, {@code foo.co.uk} and
   * {@code com}, but not for {@code google} or {@code google.foo}.
   *
   * @since 6
   */
  public boolean hasPublicSuffix() {
    return publicSuffixIndex != NO_PUBLIC_SUFFIX_FOUND;
  }

  /**
   * Returns the {@linkplain #isPublicSuffix() public suffix} portion of the
   * domain name, or {@code null} if no public suffix is present.
   *
   * @since 6
   */
  public InternetDomainName publicSuffix() {
    return hasPublicSuffix() ? ancestor(publicSuffixIndex) : null;
  }

  /**
   * Indicates whether this domain name ends in a {@linkplain #isPublicSuffix()
   * public suffix}, while not being a public suffix itself. For example,
   * returns {@code true} for {@code www.google.com}, {@code foo.co.uk} and
   * {@code bar.ca.us}, but not for {@code google}, {@code com}, or {@code
   * google.foo}.
   *
   * @since 6
   */
  public boolean isUnderPublicSuffix() {
    return publicSuffixIndex > 0;
  }

  /**
   * Indicates whether this domain name is composed of exactly one subdomain
   * component followed by a {@linkplain #isPublicSuffix() public suffix}. For
   * example, returns {@code true} for {@code google.com} and {@code foo.co.uk},
   * but not for {@code www.google.com} or {@code co.uk}.
   *
   * @since 6
   */
  public boolean isTopPrivateDomain() {
    return publicSuffixIndex == 1;
  }

  /**
   * Returns the portion of this domain name that is one level beneath the
   * public suffix. For example, for {@code x.adwords.google.co.uk} it returns
   * {@code google.co.uk}, since {@code co.uk} is a public suffix. This is the
   * highest-level parent of this domain for which cookies may be set, as
   * cookies cannot be set on a public suffix itself.
   *
   * <p>If {@link #isTopPrivateDomain()} is true, the current domain name
   * instance is returned.
   *
   * @throws IllegalStateException if this domain is not {@linkplain
   *     #isUnderPublicSuffix() under a public suffix}; that is, if it does not
   *     end with a public suffix or is itself a public suffix
   * @since 6
   */
  public InternetDomainName topPrivateDomain() {
    if (isTopPrivateDomain()) {
      return this;
    }
    checkState(isUnderPublicSuffix(), "Not under a public suffix: %s", name);
    return ancestor(publicSuffixIndex - 1);
  }

  /**
   * Indicates whether this domain is composed of two or more parts.
   */
  public boolean hasParent() {
    return parts.size() > 1;
  }

  /**
   * Returns an {@code InternetDomainName} that is the immediate ancestor of
   * this one; that is, the current domain with the leftmost part removed. For
   * example, the parent of {@code www.google.com} is {@code google.com}.
   *
   * @throws IllegalStateException if the domain has no parent, as determined
   *     by {@link #hasParent}
   */
  public InternetDomainName parent() {
    checkState(hasParent(), "Domain '%s' has no parent", name);
    return ancestor(1);
  }

  /**
   * Returns the ancestor of the current domain at the given number of levels
   * "higher" (rightward) in the subdomain list. The number of levels must be
   * non-negative, and less than {@code N}, where {@code N} is the number of
   * parts in the domain, so that at least one part remains.
   *
   * <p>TODO: Reasonable candidate for addition to public API.
   */
  private InternetDomainName ancestor(int levels) {
    return new InternetDomainName(parts.subList(levels, parts.size()));
  }

  /**
   * Creates and returns a new {@code InternetDomainName} by prepending the
   * argument and a dot to the current name. For example, {@code
   * InternetDomainName.from("foo.com").child("www.bar")} returns a new {@code
   * InternetDomainName} with the value {@code www.bar.foo.com}.
   *
   * @throws NullPointerException if leftParts is null
   * @throws IllegalArgumentException if the resulting name is not valid
   */
  public InternetDomainName child(String leftParts) {
    return InternetDomainName.from(checkNotNull(leftParts) + "." + name);
  }

  /**
   * Indicates whether the argument is a syntactically valid domain name. This
   * method is intended for the case where a {@link String} must be validated as
   * a valid domain name, but no further work with that {@link String} as an
   * {@link InternetDomainName} will be required. Code like the following will
   * unnecessarily repeat the work of validation: <pre>   {@code
   *
   *   if (InternetDomainName.isValid(name)) {
   *     domainName = InternetDomainName.from(name);
   *   } else {
   *     domainName = DEFAULT_DOMAIN;
   *   }}</pre>
   *
   * Such code could instead be written as follows: <pre>   {@code
   *
   *   try {
   *     domainName = InternetDomainName.from(name);
   *   } catch (IllegalArgumentException e) {
   *     domainName = DEFAULT_DOMAIN;
   *   }}</pre>
   */
  public static boolean isValid(String name) {
    try {
      from(name);
      return true;
    } catch (IllegalArgumentException e) {
      return false;
    }
  }

  /**
   * Does the domain name satisfy the Mozilla criteria for a {@linkplain
   * #isPublicSuffix() public suffix}?
   */
  private static boolean isPublicSuffixInternal(String domain) {
    return TldPatterns.EXACT.contains(domain)
        || (!TldPatterns.EXCLUDED.contains(domain)
            && matchesWildcardPublicSuffix(domain));
  }

  /**
   * Does the domain name match one of the "wildcard" patterns (e.g. "*.ar")?
   * That is, does the text after the first dot appear in
   * {@code TldPatterns.UNDER}? A name without a dot never matches.
   */
  private static boolean matchesWildcardPublicSuffix(String domain) {
    final int firstDot = domain.indexOf('.');
    return firstDot >= 0
        && TldPatterns.UNDER.contains(domain.substring(firstDot + 1));
  }

  // TODO: specify this to return the same as name(); remove name()
  @Override
  public String toString() {
    return Objects.toStringHelper(this).add("name", name).toString();
  }

  @Override
  public boolean equals(@Nullable Object object) {
    if (object == this) {
      return true;
    }

    if (object instanceof InternetDomainName) {
      InternetDomainName that = (InternetDomainName) object;
      return this.name.equals(that.name);
    }

    return false;
  }

  @Override
  public int hashCode() {
    return name.hashCode();
  }
}