001 /*
002 * Copyright (C) 2009 Google Inc.
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 * http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 */
016
017 package com.google.common.net;
018
019 import static com.google.common.base.Preconditions.checkArgument;
020 import static com.google.common.base.Preconditions.checkNotNull;
021 import static com.google.common.base.Preconditions.checkState;
022
023 import com.google.common.annotations.Beta;
024 import com.google.common.annotations.GwtCompatible;
025 import com.google.common.base.CharMatcher;
026 import com.google.common.base.Joiner;
027 import com.google.common.base.Objects;
028 import com.google.common.base.Splitter;
029 import com.google.common.collect.ImmutableList;
030
031 import java.util.List;
032
033 import javax.annotation.Nullable;
034
035 /**
036 * An immutable well-formed internet domain name, as defined by
037 * <a href="http://www.ietf.org/rfc/rfc1035.txt">RFC 1035</a>, with the
038 * exception that names ending in {@code "."} are not supported (as they are not
039 * generally used in browsers, email, and other end-user applications. Examples
040 * include {@code com} and {@code foo.co.uk}. Only syntactic analysis is
041 * performed; no DNS lookups or other network interactions take place. Thus
042 * there is no guarantee that the domain actually exists on the internet.
043 * Invalid domain names throw {@link IllegalArgumentException} on construction.
044 *
045 * <p>It is often the case that domains of interest are those under a
046 * {@linkplain #isPublicSuffix() public suffix} but not themselves a public
047 * suffix; {@link #hasPublicSuffix()} and {@link #isTopPrivateDomain()} test for
048 * this. Similarly, one often needs to obtain the domain consisting of the
049 * public suffix plus one subdomain level, typically to obtain the highest-level
050 * domain for which cookies may be set. Use {@link #topPrivateDomain()} for this
051 * purpose.
052 *
053 * <p>{@linkplain #equals(Object) Equality} of domain names is case-insensitive,
054 * so for convenience, the {@link #name()} and {@link #parts()} methods return
055 * the lowercase form of the name.
056 *
057 * <p><a href="http://en.wikipedia.org/wiki/Internationalized_domain_name">
058 * internationalized domain names (IDN)</a> such as {@code 网络.cn} are
059 * supported.
060 *
061 * @author Craig Berry
062 * @since 5
063 */
064 @Beta
065 @GwtCompatible
066 public final class InternetDomainName {
067 private static final Splitter DOT_SPLITTER = Splitter.on('.');
068 private static final Joiner DOT_JOINER = Joiner.on('.');
069
070 /**
071 * Value of {@link #publicSuffixIndex} which indicates that no public suffix
072 * was found.
073 */
074 private static final int NO_PUBLIC_SUFFIX_FOUND = -1;
075
076 private static final String DOT_REGEX = "\\.";
077
078 /**
079 * The full domain name, converted to lower case.
080 */
081 private final String name;
082
083 /**
084 * The parts of the domain name, converted to lower case.
085 */
086 private final ImmutableList<String> parts;
087
088 /**
089 * The index in the {@link #parts()} list at which the public suffix begins.
090 * For example, for the domain name {@code www.google.co.uk}, the value would
091 * be 2 (the index of the {@code co} part). The value is negative
092 * (specifically, {@link #NO_PUBLIC_SUFFIX_FOUND}) if no public suffix was
093 * found.
094 */
095 private final int publicSuffixIndex;
096
097 /**
098 * Private constructor used to implement {@link #from(String)}.
099 */
100 private InternetDomainName(String name) {
101 this.name = name;
102 this.parts = ImmutableList.copyOf(DOT_SPLITTER.split(name));
103 checkArgument(validateSyntax(parts), "Not a valid domain name: '%s'", name);
104 this.publicSuffixIndex = findPublicSuffix();
105 }
106
107 /**
108 * Private constructor used to implement {@link #ancestor(int)}. Argument
109 * parts are assumed to be valid, as they always come from an existing domain.
110 */
111 private InternetDomainName(List<String> parts) {
112 checkArgument(!parts.isEmpty());
113
114 this.parts = ImmutableList.copyOf(parts);
115 this.name = DOT_JOINER.join(parts);
116 this.publicSuffixIndex = findPublicSuffix();
117 }
118
119 /**
120 * Returns the index of the leftmost part of the public suffix, or -1 if not
121 * found.
122 */
123 private int findPublicSuffix() {
124 final int partsSize = parts.size();
125
126 for (int i = 0; i < partsSize; i++) {
127 String ancestorName = DOT_JOINER.join(parts.subList(i, partsSize));
128
129 if (isPublicSuffixInternal(ancestorName)) {
130 return i;
131 }
132 }
133
134 return NO_PUBLIC_SUFFIX_FOUND;
135 }
136
137 /**
138 * A factory method for creating {@code InternetDomainName} objects.
139 *
140 * @param domain A domain name (not IP address)
141 * @throws IllegalArgumentException If name is not syntactically valid
142 */
143 public static InternetDomainName from(String domain) {
144 // RFC 1035 defines domain names to be case-insensitive; normalizing
145 // to lower case allows us to simplify matching.
146 return new InternetDomainName(domain.toLowerCase());
147 }
148
149 // TODO: For the moment, we validate that all parts of a domain
150 // * Start and end with an alphanumeric character
151 // * Have alphanumeric, dash, or underscore characters internally
152 // An additional constraint is that the first character of the last part
153 // may not be numeric.
154 // All of this is a compromise to allow relatively accurate and efficient
155 // checking. We may soon move to using java.net.IDN for this purpose in
156 // non-GWT code.
157
158 /**
159 * Validation method used by {@from} to ensure that the domain name is
160 * syntactically valid according to RFC 1035.
161 *
162 * @return Is the domain name syntactically valid?
163 */
164 private static boolean validateSyntax(List<String> parts) {
165 final int lastIndex = parts.size() - 1;
166
167 // Validate the last part specially, as it has different syntax rules.
168
169 if (!validatePart(parts.get(lastIndex), true)) {
170 return false;
171 }
172
173 for (int i = 0; i < lastIndex; i++) {
174 String part = parts.get(i);
175 if (!validatePart(part, false)) {
176 return false;
177 }
178 }
179
180 return true;
181 }
182
183 /**
184 * The maximum size of a single part of a domain name.
185 */
186 private static final int MAX_DOMAIN_PART_LENGTH = 63;
187
188 private static final CharMatcher DASH_MATCHER = CharMatcher.anyOf("-_");
189
190 private static final CharMatcher PART_CHAR_MATCHER =
191 CharMatcher.JAVA_LETTER_OR_DIGIT.or(DASH_MATCHER);
192
193 /**
194 * Helper method for {@link #validateSyntax(List)}. Validates that one part of
195 * a domain name is valid.
196 *
197 * @param part The domain name part to be validated
198 * @param isFinalPart Is this the final (rightmost) domain part?
199 * @return Whether the part is valid
200 */
201 private static boolean validatePart(String part, boolean isFinalPart) {
202
203 // These tests could be collapsed into one big boolean expression, but
204 // they have been left as independent tests for clarity.
205
206 if (part.length() < 1 || part.length() > MAX_DOMAIN_PART_LENGTH) {
207 return false;
208 }
209
210 // GWT claims to support java.lang.Character's char-classification
211 // methods, but it actually only works for ASCII. So for now,
212 // assume anything with non-ASCII characters is valid.
213 // The only place this seems to be documented is here:
214 // http://osdir.com/ml/GoogleWebToolkitContributors/2010-03/msg00178.html
215
216 if (!CharMatcher.ASCII.matchesAllOf(part)) {
217 return true;
218 }
219
220 if (!PART_CHAR_MATCHER.matchesAllOf(part)) {
221 return false;
222 }
223
224 if (DASH_MATCHER.matches(part.charAt(0))
225 || DASH_MATCHER.matches(part.charAt(part.length() - 1))) {
226 return false;
227 }
228
229 if (isFinalPart && CharMatcher.DIGIT.matches(part.charAt(0))) {
230 return false;
231 }
232
233 return true;
234 }
235
236 /**
237 * Returns the domain name, normalized to all lower case.
238 */
239 public String name() {
240 return name;
241 }
242
243 /**
244 * Returns the individual components of this domain name, normalized to all
245 * lower case. For example, for the domain name {@code mail.google.com}, this
246 * method returns the list {@code ["mail", "google", "com"]}.
247 */
248 public ImmutableList<String> parts() {
249 return parts;
250 }
251
252 /**
253 * Old location of {@link #isPublicSuffix()}.
254 *
255 * @deprecated use {@link #isPublicSuffix()}
256 */
257 @Deprecated public boolean isRecognizedTld() {
258 return isPublicSuffix();
259 }
260
261 /**
262 * Old location of {@link #isUnderPublicSuffix()}.
263 *
264 * @deprecated use {@link #isUnderPublicSuffix()}
265 */
266 @Deprecated public boolean isUnderRecognizedTld() {
267 return isUnderPublicSuffix();
268 }
269
270 /**
271 * Old location of {@link #hasPublicSuffix()}.
272 *
273 * @deprecated use {@link #hasPublicSuffix()}
274 */
275 @Deprecated public boolean hasRecognizedTld() {
276 return hasPublicSuffix();
277 }
278
279 /**
280 * Old location of {@link #publicSuffix()}.
281 *
282 * @deprecated use {@link #publicSuffix()}
283 */
284 @Deprecated public InternetDomainName recognizedTld() {
285 return publicSuffix();
286 }
287
288 /**
289 * Old location of {@link #isTopPrivateDomain()}.
290 *
291 * @deprecated use {@link #isTopPrivateDomain()}
292 */
293 @Deprecated public boolean isImmediatelyUnderTld() {
294 return isTopPrivateDomain();
295 }
296
297 /**
298 * Old location of {@link #topPrivateDomain()}.
299 *
300 * @deprecated use {@link #topPrivateDomain()}
301 */
302 @Deprecated public InternetDomainName topCookieDomain() {
303 return topPrivateDomain();
304 }
305
306 /**
307 * Returns the rightmost non-{@linkplain #isRecognizedTld() TLD} domain name
308 * part. For example
309 * {@code new InternetDomainName("www.google.com").rightmostNonTldPart()}
310 * returns {@code "google"}. Returns null if either no
311 * {@linkplain #isRecognizedTld() TLD} is found, or the whole domain name is
312 * itself a {@linkplain #isRecognizedTld() TLD}.
313 *
314 * @deprecated use the first {@linkplain #parts part} of the {@link
315 * #topPrivateDomain()}
316 */
317 @Deprecated public String rightmostNonTldPart() {
318 return publicSuffixIndex >= 1
319 ? parts.get(publicSuffixIndex - 1)
320 : null;
321 }
322
323 /**
324 * Indicates whether this domain name represents a <i>public suffix</i>, as
325 * defined by the Mozilla Foundation's
326 * <a href="http://publicsuffix.org/">Public Suffix List</a> (PSL). A public
327 * suffix is one under which Internet users can directly register names, such
328 * as {@code com}, {@code co.uk} or {@code pvt.k12.wy.us}. Examples of domain
329 * names that are <i>not</i> public suffixes include {@code google}, {@code
330 * google.com} and {@code foo.co.uk}.
331 *
332 * @return {@code true} if this domain name appears exactly on the public
333 * suffix list
334 * @since 6
335 */
336 public boolean isPublicSuffix() {
337 return publicSuffixIndex == 0;
338 }
339
340 /**
341 * Indicates whether this domain name ends in a {@linkplain #isPublicSuffix()
342 * public suffix}, including if it is a public suffix itself. For example,
343 * returns {@code true} for {@code www.google.com}, {@code foo.co.uk} and
344 * {@code com}, but not for {@code google} or {@code google.foo}.
345 *
346 * @since 6
347 */
348 public boolean hasPublicSuffix() {
349 return publicSuffixIndex != NO_PUBLIC_SUFFIX_FOUND;
350 }
351
352 /**
353 * Returns the {@linkplain #isPublicSuffix() public suffix} portion of the
354 * domain name, or {@code null} if no public suffix is present.
355 *
356 * @since 6
357 */
358 public InternetDomainName publicSuffix() {
359 return hasPublicSuffix() ? ancestor(publicSuffixIndex) : null;
360 }
361
362 /**
363 * Indicates whether this domain name ends in a {@linkplain #isPublicSuffix()
364 * public suffix}, while not being a public suffix itself. For example,
365 * returns {@code true} for {@code www.google.com}, {@code foo.co.uk} and
366 * {@code bar.ca.us}, but not for {@code google}, {@code com}, or {@code
367 * google.foo}.
368 *
369 * @since 6
370 */
371 public boolean isUnderPublicSuffix() {
372 return publicSuffixIndex > 0;
373 }
374
375 /**
376 * Indicates whether this domain name is composed of exactly one subdomain
377 * component followed by a {@linkplain #isPublicSuffix() public suffix}. For
378 * example, returns {@code true} for {@code google.com} and {@code foo.co.uk},
379 * but not for {@code www.google.com} or {@code co.uk}.
380 *
381 * @since 6
382 */
383 public boolean isTopPrivateDomain() {
384 return publicSuffixIndex == 1;
385 }
386
387 /**
388 * Returns the portion of this domain name that is one level beneath the
389 * public suffix. For example, for {@code x.adwords.google.co.uk} it returns
390 * {@code google.co.uk}, since {@code co.uk} is a public suffix. This is the
391 * highest-level parent of this domain for which cookies may be set, as
392 * cookies cannot be set on a public suffix itself.
393 *
394 * <p>If {@link #isTopPrivateDomain()} is true, the current domain name
395 * instance is returned.
396 *
397 * @throws IllegalStateException if this domain does not end with a
398 * public suffix
399 * @since 6
400 */
401 public InternetDomainName topPrivateDomain() {
402 if (isTopPrivateDomain()) {
403 return this;
404 }
405 checkState(isUnderPublicSuffix(), "Not under a public suffix: %s", name);
406 return ancestor(publicSuffixIndex - 1);
407 }
408
409 /**
410 * Indicates whether this domain is composed of two or more parts.
411 */
412 public boolean hasParent() {
413 return parts.size() > 1;
414 }
415
416 /**
417 * Returns an {@code InternetDomainName} that is the immediate ancestor of
418 * this one; that is, the current domain with the leftmost part removed. For
419 * example, the parent of {@code www.google.com} is {@code google.com}.
420 *
421 * @throws IllegalStateException if the domain has no parent, as determined
422 * by {@link #hasParent}
423 */
424 public InternetDomainName parent() {
425 checkState(hasParent(), "Domain '%s' has no parent", name);
426 return ancestor(1);
427 }
428
429 /**
430 * Returns the ancestor of the current domain at the given number of levels
431 * "higher" (rightward) in the subdomain list. The number of levels must be
432 * non-negative, and less than {@code N-1}, where {@code N} is the number of
433 * parts in the domain.
434 *
435 * <p>TODO: Reasonable candidate for addition to public API.
436 */
437 private InternetDomainName ancestor(int levels) {
438 return new InternetDomainName(parts.subList(levels, parts.size()));
439 }
440
441 /**
442 * Creates and returns a new {@code InternetDomainName} by prepending the
443 * argument and a dot to the current name. For example, {@code
444 * InternetDomainName.from("foo.com").child("www.bar")} returns a new {@code
445 * InternetDomainName} with the value {@code www.bar.foo.com}.
446 *
447 * @throws NullPointerException if leftParts is null
448 * @throws IllegalArgumentException if the resulting name is not valid
449 */
450 public InternetDomainName child(String leftParts) {
451 return InternetDomainName.from(checkNotNull(leftParts) + "." + name);
452 }
453
454 /**
455 * Indicates whether the argument is a syntactically valid domain name. This
456 * method is intended for the case where a {@link String} must be validated as
457 * a valid domain name, but no further work with that {@link String} as an
458 * {@link InternetDomainName} will be required. Code like the following will
459 * unnecessarily repeat the work of validation: <pre> {@code
460 *
461 * if (InternetDomainName.isValid(name)) {
462 * domainName = InternetDomainName.from(name);
463 * } else {
464 * domainName = DEFAULT_DOMAIN;
465 * }}</pre>
466 *
467 * Such code could instead be written as follows: <pre> {@code
468 *
469 * try {
470 * domainName = InternetDomainName.from(name);
471 * } catch (IllegalArgumentException e) {
472 * domainName = DEFAULT_DOMAIN;
473 * }}</pre>
474 */
475 public static boolean isValid(String name) {
476 try {
477 from(name);
478 return true;
479 } catch (IllegalArgumentException e) {
480 return false;
481 }
482 }
483
484 /**
485 * Does the domain name satisfy the Mozilla criteria for a {@linkplain
486 * #isPublicSuffix() public suffix}?
487 */
488 private static boolean isPublicSuffixInternal(String domain) {
489 return TldPatterns.EXACT.contains(domain)
490 || (!TldPatterns.EXCLUDED.contains(domain)
491 && matchesWildcardPublicSuffix(domain));
492 }
493
494 /**
495 * Does the domain name match one of the "wildcard" patterns (e.g. "*.ar")?
496 */
497 private static boolean matchesWildcardPublicSuffix(String domain) {
498 final String[] pieces = domain.split(DOT_REGEX, 2);
499 return pieces.length == 2 && TldPatterns.UNDER.contains(pieces[1]);
500 }
501
502 // TODO: specify this to return the same as name(); remove name()
503 @Override
504 public String toString() {
505 return Objects.toStringHelper(this).add("name", name).toString();
506 }
507
508 @Override
509 public boolean equals(@Nullable Object object) {
510 if (object == this) {
511 return true;
512 }
513
514 if (object instanceof InternetDomainName) {
515 InternetDomainName that = (InternetDomainName) object;
516 return this.name.equals(that.name);
517 }
518
519 return false;
520 }
521
522 @Override
523 public int hashCode() {
524 return name.hashCode();
525 }
526 }