[go: nahoru, domu]

1/*
2 * Copyright (C) 2013 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17package android.support.v4.text;
18
19import android.support.v4.view.ViewCompat;
20
21import java.util.Locale;
22
23import static android.support.v4.text.TextDirectionHeuristicsCompat.FIRSTSTRONG_LTR;
24
25/**
26 * Utility class for formatting text for display in a potentially opposite-directionality context
27 * without garbling. The directionality of the context is set at formatter creation and the
28 * directionality of the text can be either estimated or passed in when known. Provides the
29 * following functionality:
30 * <p>
31 * 1. Bidi Wrapping
32 * When text in one language is mixed into a document in another, opposite-directionality language,
33 * e.g. when an English business name is embedded in a Hebrew web page, both the inserted string
34 * and the text surrounding it may be displayed incorrectly unless the inserted string is explicitly
35 * separated from the surrounding text in a "wrapper" that:
36 * <p>
37 * - Declares its directionality so that the string is displayed correctly. This can be done in
38 *   Unicode bidi formatting codes by {@link #unicodeWrap} and similar methods.
39 * <p>
40 * - Isolates the string's directionality, so it does not unduly affect the surrounding content.
41 *   Currently, this can only be done using invisible Unicode characters of the same direction as
42 *   the context (LRM or RLM) in addition to the directionality declaration above, thus "resetting"
43 *   the directionality to that of the context. The "reset" may need to be done at both ends of the
44 *   string. Without "reset" after the string, the string will "stick" to a number or logically
45 *   separate opposite-direction text that happens to follow it in-line (even if separated by
46 *   neutral content like spaces and punctuation). Without "reset" before the string, the same can
47 *   happen there, but only with more opposite-direction text, not a number. One approach is to
48 *   "reset" the direction only after each string, on the theory that if the preceding opposite-
49 *   direction text is itself bidi-wrapped, the "reset" after it will prevent the sticking. (Doing
50 *   the "reset" only before each string definitely does not work because we do not want to require
51 *   bidi-wrapping numbers, and a bidi-wrapped opposite-direction string could be followed by a
52 *   number.) Still, the safest policy is to do the "reset" on both ends of each string, since RTL
53 *   message translations often contain untranslated Latin-script brand names and technical terms,
54 *   and one of these can be followed by a bidi-wrapped inserted value. On the other hand, when one
55 *   has such a message, it is best to do the "reset" manually in the message translation itself,
56 *   since the message's opposite-direction text could be followed by an inserted number, which we
57 *   would not bidi-wrap anyway. Thus, "reset" only after the string is the current default. In an
58 *   alternative to "reset", recent additions to the HTML, CSS, and Unicode standards allow the
59 *   isolation to be part of the directionality declaration. This form of isolation is better than
60 *   "reset" because it takes less space, does not require knowing the context directionality, has a
61 *   gentler effect than "reset", and protects both ends of the string. However, we do not yet allow
62 *   using it because required platforms do not yet support it.
63 * <p>
64 * Providing these wrapping services is the basic purpose of the bidi formatter.
65 * <p>
66 * 2. Directionality estimation
67 * How does one know whether a string about to be inserted into surrounding text has the same
68 * directionality? Well, in many cases, one knows that this must be the case when writing the code
69 * doing the insertion, e.g. when a localized message is inserted into a localized page. In such
70 * cases there is no need to involve the bidi formatter at all. In some other cases, it need not be
71 * the same as the context, but is either constant (e.g. urls are always LTR) or otherwise known.
72 * In the remaining cases, e.g. when the string is user-entered or comes from a database, the
73 * language of the string (and thus its directionality) is not known a priori, and must be
74 * estimated at run-time. The bidi formatter can do this automatically using the default
75 * first-strong estimation algorithm. It can also be configured to use a custom directionality
76 * estimation object.
77 */
78public final class BidiFormatter {
79
80    /**
81     * The default text direction heuristic.
82     */
83    private static TextDirectionHeuristicCompat DEFAULT_TEXT_DIRECTION_HEURISTIC = FIRSTSTRONG_LTR;
84
85    /**
86     * Unicode "Left-To-Right Embedding" (LRE) character.
87     */
88    private static final char LRE = '\u202A';
89
90    /**
91     * Unicode "Right-To-Left Embedding" (RLE) character.
92     */
93    private static final char RLE = '\u202B';
94
95    /**
96     * Unicode "Pop Directional Formatting" (PDF) character.
97     */
98    private static final char PDF = '\u202C';
99
100    /**
101     *  Unicode "Left-To-Right Mark" (LRM) character.
102     */
103    private static final char LRM = '\u200E';
104
105    /*
106     * Unicode "Right-To-Left Mark" (RLM) character.
107     */
108    private static final char RLM = '\u200F';
109
110    /*
111     * String representation of LRM
112     */
113    private static final String LRM_STRING = Character.toString(LRM);
114
115    /*
116     * String representation of RLM
117     */
118    private static final String RLM_STRING = Character.toString(RLM);
119
120    /**
121     * Empty string constant.
122     */
123    private static final String EMPTY_STRING = "";
124
125    /**
126     * A class for building a BidiFormatter with non-default options.
127     */
128    public static final class Builder {
129        private boolean mIsRtlContext;
130        private int mFlags;
131        private TextDirectionHeuristicCompat mTextDirectionHeuristicCompat;
132
133        /**
134         * Constructor.
135         *
136         */
137        public Builder() {
138            initialize(isRtlLocale(Locale.getDefault()));
139        }
140
141        /**
142         * Constructor.
143         *
144         * @param rtlContext Whether the context directionality is RTL.
145         */
146        public Builder(boolean rtlContext) {
147            initialize(rtlContext);
148        }
149
150        /**
151         * Constructor.
152         *
153         * @param locale The context locale.
154         */
155        public Builder(Locale locale) {
156            initialize(isRtlLocale(locale));
157        }
158
159        /**
160         * Initializes the builder with the given context directionality and default options.
161         *
162         * @param isRtlContext Whether the context is RTL or not.
163         */
164        private void initialize(boolean isRtlContext) {
165            mIsRtlContext = isRtlContext;
166            mTextDirectionHeuristicCompat = DEFAULT_TEXT_DIRECTION_HEURISTIC;
167            mFlags = DEFAULT_FLAGS;
168        }
169
170        /**
171         * Specifies whether the BidiFormatter to be built should also "reset" directionality before
172         * a string being bidi-wrapped, not just after it. The default is true.
173         */
174        public Builder stereoReset(boolean stereoReset) {
175            if (stereoReset) {
176                mFlags |= FLAG_STEREO_RESET;
177            } else {
178                mFlags &= ~FLAG_STEREO_RESET;
179            }
180            return this;
181        }
182
183        /**
184         * Specifies the default directionality estimation algorithm to be used by the BidiFormatter.
185         * By default, uses the first-strong heuristic.
186         *
187         * @param heuristic the {@code TextDirectionHeuristic} to use.
188         * @return the builder itself.
189         */
190        public Builder setTextDirectionHeuristic(TextDirectionHeuristicCompat heuristic) {
191            mTextDirectionHeuristicCompat = heuristic;
192            return this;
193        }
194
195        private static BidiFormatter getDefaultInstanceFromContext(boolean isRtlContext) {
196            return isRtlContext ? DEFAULT_RTL_INSTANCE : DEFAULT_LTR_INSTANCE;
197        }
198
199        /**
200         * @return A BidiFormatter with the specified options.
201         */
202        public BidiFormatter build() {
203            if (mFlags == DEFAULT_FLAGS &&
204                    mTextDirectionHeuristicCompat == DEFAULT_TEXT_DIRECTION_HEURISTIC) {
205                return getDefaultInstanceFromContext(mIsRtlContext);
206            }
207            return new BidiFormatter(mIsRtlContext, mFlags, mTextDirectionHeuristicCompat);
208        }
209    }
210
211    //
212    private static final int FLAG_STEREO_RESET = 2;
213    private static final int DEFAULT_FLAGS = FLAG_STEREO_RESET;
214
215    private static final BidiFormatter DEFAULT_LTR_INSTANCE = new BidiFormatter(
216            false /* LTR context */,
217            DEFAULT_FLAGS,
218            DEFAULT_TEXT_DIRECTION_HEURISTIC);
219
220    private static final BidiFormatter DEFAULT_RTL_INSTANCE = new BidiFormatter(
221            true /* RTL context */,
222            DEFAULT_FLAGS,
223            DEFAULT_TEXT_DIRECTION_HEURISTIC);
224
225    private final boolean mIsRtlContext;
226    private final int mFlags;
227    private final TextDirectionHeuristicCompat mDefaultTextDirectionHeuristicCompat;
228
229    /**
230     * Factory for creating an instance of BidiFormatter for the default locale directionality.
231     *
232     */
233    public static BidiFormatter getInstance() {
234        return new Builder().build();
235    }
236
237    /**
238     * Factory for creating an instance of BidiFormatter given the context directionality.
239     *
240     * @param rtlContext Whether the context directionality is RTL.
241     */
242    public static BidiFormatter getInstance(boolean rtlContext) {
243        return new Builder(rtlContext).build();
244    }
245
246    /**
247     * Factory for creating an instance of BidiFormatter given the context locale.
248     *
249     * @param locale The context locale.
250     */
251    public static BidiFormatter getInstance(Locale locale) {
252        return new Builder(locale).build();
253    }
254
255    /**
256     * @param isRtlContext Whether the context directionality is RTL or not.
257     * @param flags The option flags.
258     * @param heuristic The default text direction heuristic.
259     */
260    private BidiFormatter(boolean isRtlContext, int flags, TextDirectionHeuristicCompat heuristic) {
261        mIsRtlContext = isRtlContext;
262        mFlags = flags;
263        mDefaultTextDirectionHeuristicCompat = heuristic;
264    }
265
266    /**
267     * @return Whether the context directionality is RTL
268     */
269    public boolean isRtlContext() {
270        return mIsRtlContext;
271    }
272
273    /**
274     * @return Whether directionality "reset" should also be done before a string being
275     * bidi-wrapped, not just after it.
276     */
277    public boolean getStereoReset() {
278        return (mFlags & FLAG_STEREO_RESET) != 0;
279    }
280
281    /**
282     * Returns a Unicode bidi mark matching the context directionality (LRM or RLM) if either the
283     * overall or the exit directionality of a given string is opposite to the context directionality.
284     * Putting this after the string (including its directionality declaration wrapping) prevents it
285     * from "sticking" to other opposite-directionality text or a number appearing after it inline
286     * with only neutral content in between. Otherwise returns the empty string. While the exit
287     * directionality is determined by scanning the end of the string, the overall directionality is
288     * given explicitly by a heuristic to estimate the {@code str}'s directionality.
289     *
290     * @param str String after which the mark may need to appear.
291     * @param heuristic The text direction heuristic that will be used to estimate the {@code str}'s
292     *                  directionality.
293     * @return LRM for RTL text in LTR context; RLM for LTR text in RTL context;
294     *     else, the empty string.
295     */
296    private String markAfter(String str, TextDirectionHeuristicCompat heuristic) {
297        final boolean isRtl = heuristic.isRtl(str, 0, str.length());
298        // getExitDir() is called only if needed (short-circuit).
299        if (!mIsRtlContext && (isRtl || getExitDir(str) == DIR_RTL)) {
300            return LRM_STRING;
301        }
302        if (mIsRtlContext && (!isRtl || getExitDir(str) == DIR_LTR)) {
303            return RLM_STRING;
304        }
305        return EMPTY_STRING;
306    }
307
308    /**
309     * Returns a Unicode bidi mark matching the context directionality (LRM or RLM) if either the
310     * overall or the entry directionality of a given string is opposite to the context
311     * directionality. Putting this before the string (including its directionality declaration
312     * wrapping) prevents it from "sticking" to other opposite-directionality text appearing before
313     * it inline with only neutral content in between. Otherwise returns the empty string. While the
314     * entry directionality is determined by scanning the beginning of the string, the overall
315     * directionality is given explicitly by a heuristic to estimate the {@code str}'s directionality.
316     *
317     * @param str String before which the mark may need to appear.
318     * @param heuristic The text direction heuristic that will be used to estimate the {@code str}'s
319     *                  directionality.
320     * @return LRM for RTL text in LTR context; RLM for LTR text in RTL context;
321     *     else, the empty string.
322     */
323    private String markBefore(String str, TextDirectionHeuristicCompat heuristic) {
324        final boolean isRtl = heuristic.isRtl(str, 0, str.length());
325        // getEntryDir() is called only if needed (short-circuit).
326        if (!mIsRtlContext && (isRtl || getEntryDir(str) == DIR_RTL)) {
327            return LRM_STRING;
328        }
329        if (mIsRtlContext && (!isRtl || getEntryDir(str) == DIR_LTR)) {
330            return RLM_STRING;
331        }
332        return EMPTY_STRING;
333    }
334
335    /**
336     * Estimates the directionality of a string using the default text direction heuristic.
337     *
338     * @param str String whose directionality is to be estimated.
339     * @return true if {@code str}'s estimated overall directionality is RTL. Otherwise returns
340     *          false.
341     */
342    public boolean isRtl(String str) {
343        return mDefaultTextDirectionHeuristicCompat.isRtl(str, 0, str.length());
344    }
345
346    /**
347     * Formats a string of given directionality for use in plain-text output of the context
348     * directionality, so an opposite-directionality string is neither garbled nor garbles its
349     * surroundings. This makes use of Unicode bidi formatting characters.
350     * <p>
351     * The algorithm: In case the given directionality doesn't match the context directionality, wraps
352     * the string with Unicode bidi formatting characters: RLE+{@code str}+PDF for RTL text, or
353     * LRE+{@code str}+PDF for LTR text.
354     * <p>
355     * If {@code isolate}, directionally isolates the string so that it does not garble its
356     * surroundings. Currently, this is done by "resetting" the directionality after the string by
357     * appending a trailing Unicode bidi mark matching the context directionality (LRM or RLM) when
358     * either the overall directionality or the exit directionality of the string is opposite to
359     * that of the context. Unless the formatter was built using
360     * {@link Builder#stereoReset(boolean)} with a {@code false} argument, also prepends a Unicode
361     * bidi mark matching the context directionality when either the overall directionality or the
362     * entry directionality of the string is opposite to that of the context. Note that as opposed
363     * to the overall directionality, the entry and exit directionalities are determined from the
364     * string itself.
365     * <p>
366     * Does *not* do HTML-escaping.
367     *
368     * @param str The input string.
369     * @param heuristic The algorithm to be used to estimate the string's overall direction.
370     * @param isolate Whether to directionally isolate the string to prevent it from garbling the
371     *     content around it
372     * @return Input string after applying the above processing. {@code null} if {@code str} is
373     *     {@code null}.
374     */
375    public String unicodeWrap(String str, TextDirectionHeuristicCompat heuristic, boolean isolate) {
376        if (str == null) return null;
377        final boolean isRtl = heuristic.isRtl(str, 0, str.length());
378        StringBuilder result = new StringBuilder();
379        if (getStereoReset() && isolate) {
380            result.append(markBefore(str,
381                    isRtl ? TextDirectionHeuristicsCompat.RTL : TextDirectionHeuristicsCompat.LTR));
382        }
383        if (isRtl != mIsRtlContext) {
384            result.append(isRtl ? RLE : LRE);
385            result.append(str);
386            result.append(PDF);
387        } else {
388            result.append(str);
389        }
390        if (isolate) {
391            result.append(markAfter(str,
392                    isRtl ? TextDirectionHeuristicsCompat.RTL : TextDirectionHeuristicsCompat.LTR));
393        }
394        return result.toString();
395    }
396
397    /**
398     * Operates like {@link #unicodeWrap(String, android.support.v4.text.TextDirectionHeuristicCompat, boolean)}, but assumes
399     * {@code isolate} is true.
400     *
401     * @param str The input string.
402     * @param heuristic The algorithm to be used to estimate the string's overall direction.
403     * @return Input string after applying the above processing.
404     */
405    public String unicodeWrap(String str, TextDirectionHeuristicCompat heuristic) {
406        return unicodeWrap(str, heuristic, true /* isolate */);
407    }
408
409    /**
410     * Operates like {@link #unicodeWrap(String, android.support.v4.text.TextDirectionHeuristicCompat, boolean)}, but uses the
411     * formatter's default direction estimation algorithm.
412     *
413     * @param str The input string.
414     * @param isolate Whether to directionally isolate the string to prevent it from garbling the
415     *     content around it
416     * @return Input string after applying the above processing.
417     */
418    public String unicodeWrap(String str, boolean isolate) {
419        return unicodeWrap(str, mDefaultTextDirectionHeuristicCompat, isolate);
420    }
421
422    /**
423     * Operates like {@link #unicodeWrap(String, android.support.v4.text.TextDirectionHeuristicCompat, boolean)}, but uses the
424     * formatter's default direction estimation algorithm and assumes {@code isolate} is true.
425     *
426     * @param str The input string.
427     * @return Input string after applying the above processing.
428     */
429    public String unicodeWrap(String str) {
430        return unicodeWrap(str, mDefaultTextDirectionHeuristicCompat, true /* isolate */);
431    }
432
433    /**
434     * Helper method to return true if the Locale directionality is RTL.
435     *
436     * @param locale The Locale whose directionality will be checked to be RTL or LTR
437     * @return true if the {@code locale} directionality is RTL. False otherwise.
438     */
439    private static boolean isRtlLocale(Locale locale) {
440        return (TextUtilsCompat.getLayoutDirectionFromLocale(locale) == ViewCompat.LAYOUT_DIRECTION_RTL);
441    }
442
443    /**
444     * Enum for directionality type.
445     */
446    private static final int DIR_LTR = -1;
447    private static final int DIR_UNKNOWN = 0;
448    private static final int DIR_RTL = +1;
449
450    /**
451     * Returns the directionality of the last character with strong directionality in the string, or
452     * DIR_UNKNOWN if none was encountered. For efficiency, actually scans backwards from the end of
453     * the string. Treats a non-BN character between an LRE/RLE/LRO/RLO and its matching PDF as a
454     * strong character, LTR after LRE/LRO, and RTL after RLE/RLO. The results are undefined for a
455     * string containing unbalanced LRE/RLE/LRO/RLO/PDF characters. The intended use is to check
456     * whether a logically separate item that starts with a number or a character of the string's
457     * exit directionality and follows this string inline (not counting any neutral characters in
458     * between) would "stick" to it in an opposite-directionality context, thus being displayed in
459     * an incorrect position. An LRM or RLM character (the one of the context's directionality)
460     * between the two will prevent such sticking.
461     *
462     * @param str the string to check.
463     */
464    private static int getExitDir(String str) {
465        return new DirectionalityEstimator(str, false /* isHtml */).getExitDir();
466    }
467
468    /**
469     * Returns the directionality of the first character with strong directionality in the string,
470     * or DIR_UNKNOWN if none was encountered. Treats a non-BN character between an
471     * LRE/RLE/LRO/RLO and its matching PDF as a strong character, LTR after LRE/LRO, and RTL after
472     * RLE/RLO. The results are undefined for a string containing unbalanced LRE/RLE/LRO/RLO/PDF
473     * characters. The intended use is to check whether a logically separate item that ends with a
474     * character of the string's entry directionality and precedes the string inline (not counting
475     * any neutral characters in between) would "stick" to it in an opposite-directionality context,
476     * thus being displayed in an incorrect position. An LRM or RLM character (the one of the
477     * context's directionality) between the two will prevent such sticking.
478     *
479     * @param str the string to check.
480     */
481    private static int getEntryDir(String str) {
482        return new DirectionalityEstimator(str, false /* isHtml */).getEntryDir();
483    }
484
485    /**
486     * An object that estimates the directionality of a given string by various methods.
487     *
488     */
489    private static class DirectionalityEstimator {
490
491        // Internal static variables and constants.
492
493        /**
494         * Size of the bidi character class cache. The results of the Character.getDirectionality()
495         * calls on the lowest DIR_TYPE_CACHE_SIZE codepoints are kept in an array for speed.
496         * The 0x700 value is designed to leave all the European and Near Eastern languages in the
497         * cache. It can be reduced to 0x180, restricting the cache to the Western European
498         * languages.
499         */
500        private static final int DIR_TYPE_CACHE_SIZE = 0x700;
501
502        /**
503         * The bidi character class cache.
504         */
505        private static final byte DIR_TYPE_CACHE[];
506
507        static {
508            DIR_TYPE_CACHE = new byte[DIR_TYPE_CACHE_SIZE];
509            for (int i = 0; i < DIR_TYPE_CACHE_SIZE; i++) {
510                DIR_TYPE_CACHE[i] = Character.getDirectionality(i);
511            }
512        }
513
514        // Internal instance variables.
515
516        /**
517         * The text to be scanned.
518         */
519        private final String text;
520
521        /**
522         * Whether the text to be scanned is to be treated as HTML, i.e. skipping over tags and
523         * entities when looking for the next / preceding dir type.
524         */
525        private final boolean isHtml;
526
527        /**
528         * The length of the text in chars.
529         */
530        private final int length;
531
532        /**
533         * The current position in the text.
534         */
535        private int charIndex;
536
537        /**
538         * The char encountered by the last dirTypeForward or dirTypeBackward call. If it
539         * encountered a supplementary codepoint, this contains a char that is not a valid
540         * codepoint. This is ok, because this member is only used to detect some well-known ASCII
541         * syntax, e.g. "http://" and the beginning of an HTML tag or entity.
542         */
543        private char lastChar;
544
545        /**
546         * Constructor.
547         *
548         * @param text The string to scan.
549         * @param isHtml Whether the text to be scanned is to be treated as HTML, i.e. skipping over
550         *     tags and entities.
551         */
552        DirectionalityEstimator(String text, boolean isHtml) {
553            this.text = text;
554            this.isHtml = isHtml;
555            length = text.length();
556        }
557
558        /**
559         * Returns the directionality of the first character with strong directionality in the
560         * string, or DIR_UNKNOWN if none was encountered. Treats a non-BN character between an
561         * LRE/RLE/LRO/RLO and its matching PDF as a strong character, LTR after LRE/LRO, and RTL
562         * after RLE/RLO. The results are undefined for a string containing unbalanced
563         * LRE/RLE/LRO/RLO/PDF characters.
564         */
565        int getEntryDir() {
566            // The reason for this method name, as opposed to getFirstStrongDir(), is that
567            // "first strong" is a commonly used description of Unicode's estimation algorithm,
568            // but the two must treat formatting characters quite differently. Thus, we are staying
569            // away from both "first" and "last" in these method names to avoid confusion.
570            charIndex = 0;
571            int embeddingLevel = 0;
572            int embeddingLevelDir = DIR_UNKNOWN;
573            int firstNonEmptyEmbeddingLevel = 0;
574            while (charIndex < length && firstNonEmptyEmbeddingLevel == 0) {
575                switch (dirTypeForward()) {
576                    case Character.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING:
577                    case Character.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE:
578                        ++embeddingLevel;
579                        embeddingLevelDir = DIR_LTR;
580                        break;
581                    case Character.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING:
582                    case Character.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE:
583                        ++embeddingLevel;
584                        embeddingLevelDir = DIR_RTL;
585                        break;
586                    case Character.DIRECTIONALITY_POP_DIRECTIONAL_FORMAT:
587                        --embeddingLevel;
588                        // To restore embeddingLevelDir to its previous value, we would need a
589                        // stack, which we want to avoid. Thus, at this point we do not know the
590                        // current embedding's directionality.
591                        embeddingLevelDir = DIR_UNKNOWN;
592                        break;
593                    case Character.DIRECTIONALITY_BOUNDARY_NEUTRAL:
594                        break;
595                    case Character.DIRECTIONALITY_LEFT_TO_RIGHT:
596                        if (embeddingLevel == 0) {
597                            return DIR_LTR;
598                        }
599                        firstNonEmptyEmbeddingLevel = embeddingLevel;
600                        break;
601                    case Character.DIRECTIONALITY_RIGHT_TO_LEFT:
602                    case Character.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC:
603                        if (embeddingLevel == 0) {
604                            return DIR_RTL;
605                        }
606                        firstNonEmptyEmbeddingLevel = embeddingLevel;
607                        break;
608                    default:
609                        firstNonEmptyEmbeddingLevel = embeddingLevel;
610                        break;
611                }
612            }
613
614            // We have either found a non-empty embedding or scanned the entire string finding
615            // neither a non-empty embedding nor a strong character outside of an embedding.
616            if (firstNonEmptyEmbeddingLevel == 0) {
617                // We have not found a non-empty embedding. Thus, the string contains neither a
618                // non-empty embedding nor a strong character outside of an embedding.
619                return DIR_UNKNOWN;
620            }
621
622            // We have found a non-empty embedding.
623            if (embeddingLevelDir != DIR_UNKNOWN) {
624                // We know the directionality of the non-empty embedding.
625                return embeddingLevelDir;
626            }
627
628            // We do not remember the directionality of the non-empty embedding we found. So, we go
629            // backwards to find the start of the non-empty embedding and get its directionality.
630            while (charIndex > 0) {
631                switch (dirTypeBackward()) {
632                    case Character.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING:
633                    case Character.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE:
634                        if (firstNonEmptyEmbeddingLevel == embeddingLevel) {
635                            return DIR_LTR;
636                        }
637                        --embeddingLevel;
638                        break;
639                    case Character.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING:
640                    case Character.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE:
641                        if (firstNonEmptyEmbeddingLevel == embeddingLevel) {
642                            return DIR_RTL;
643                        }
644                        --embeddingLevel;
645                        break;
646                    case Character.DIRECTIONALITY_POP_DIRECTIONAL_FORMAT:
647                        ++embeddingLevel;
648                        break;
649                }
650            }
651            // We should never get here.
652            return DIR_UNKNOWN;
653        }
654
655        /**
656         * Returns the directionality of the last character with strong directionality in the
657         * string, or DIR_UNKNOWN if none was encountered. For efficiency, actually scans backwards
658         * from the end of the string. Treats a non-BN character between an LRE/RLE/LRO/RLO and its
659         * matching PDF as a strong character, LTR after LRE/LRO, and RTL after RLE/RLO. The results
660         * are undefined for a string containing unbalanced LRE/RLE/LRO/RLO/PDF characters.
661         */
662        int getExitDir() {
663            // The reason for this method name, as opposed to getLastStrongDir(), is that "last
664            // strong" sounds like the exact opposite of "first strong", which is a commonly used
665            // description of Unicode's estimation algorithm (getUnicodeDir() above), but the two
666            // must treat formatting characters quite differently. Thus, we are staying away from
667            // both "first" and "last" in these method names to avoid confusion.
668            charIndex = length;
669            int embeddingLevel = 0;
670            int lastNonEmptyEmbeddingLevel = 0;
671            while (charIndex > 0) {
672                switch (dirTypeBackward()) {
673                    case Character.DIRECTIONALITY_LEFT_TO_RIGHT:
674                        if (embeddingLevel == 0) {
675                            return DIR_LTR;
676                        }
677                        if (lastNonEmptyEmbeddingLevel == 0) {
678                            lastNonEmptyEmbeddingLevel = embeddingLevel;
679                        }
680                        break;
681                    case Character.DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING:
682                    case Character.DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE:
683                        if (lastNonEmptyEmbeddingLevel == embeddingLevel) {
684                            return DIR_LTR;
685                        }
686                        --embeddingLevel;
687                        break;
688                    case Character.DIRECTIONALITY_RIGHT_TO_LEFT:
689                    case Character.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC:
690                        if (embeddingLevel == 0) {
691                            return DIR_RTL;
692                        }
693                        if (lastNonEmptyEmbeddingLevel == 0) {
694                            lastNonEmptyEmbeddingLevel = embeddingLevel;
695                        }
696                        break;
697                    case Character.DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING:
698                    case Character.DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE:
699                        if (lastNonEmptyEmbeddingLevel == embeddingLevel) {
700                            return DIR_RTL;
701                        }
702                        --embeddingLevel;
703                        break;
704                    case Character.DIRECTIONALITY_POP_DIRECTIONAL_FORMAT:
705                        ++embeddingLevel;
706                        break;
707                    case Character.DIRECTIONALITY_BOUNDARY_NEUTRAL:
708                        break;
709                    default:
710                        if (lastNonEmptyEmbeddingLevel == 0) {
711                            lastNonEmptyEmbeddingLevel = embeddingLevel;
712                        }
713                        break;
714                }
715            }
716            return DIR_UNKNOWN;
717        }
718
719        // Internal methods
720
721        /**
722         * Gets the bidi character class, i.e. Character.getDirectionality(), of a given char, using
723         * a cache for speed. Not designed for supplementary codepoints, whose results we do not
724         * cache.
725         */
726        private static byte getCachedDirectionality(char c) {
727            return c < DIR_TYPE_CACHE_SIZE ? DIR_TYPE_CACHE[c] : Character.getDirectionality(c);
728        }
729
730        /**
731         * Returns the Character.DIRECTIONALITY_... value of the next codepoint and advances
732         * charIndex. If isHtml, and the codepoint is '<' or '&', advances through the tag/entity,
733         * and returns Character.DIRECTIONALITY_WHITESPACE. For an entity, it would be best to
734         * figure out the actual character, and return its dirtype, but treating it as whitespace is
735         * good enough for our purposes.
736         *
737         * @throws java.lang.IndexOutOfBoundsException if called when charIndex >= length or < 0.
738         */
739        byte dirTypeForward() {
740            lastChar = text.charAt(charIndex);
741            if (Character.isHighSurrogate(lastChar)) {
742                int codePoint = Character.codePointAt(text, charIndex);
743                charIndex += Character.charCount(codePoint);
744                return Character.getDirectionality(codePoint);
745            }
746            charIndex++;
747            byte dirType = getCachedDirectionality(lastChar);
748            if (isHtml) {
749                // Process tags and entities.
750                if (lastChar == '<') {
751                    dirType = skipTagForward();
752                } else if (lastChar == '&') {
753                    dirType = skipEntityForward();
754                }
755            }
756            return dirType;
757        }
758
759        /**
760         * Returns the Character.DIRECTIONALITY_... value of the preceding codepoint and advances
761         * charIndex backwards. If isHtml, and the codepoint is the end of a complete HTML tag or
762         * entity, advances over the whole tag/entity and returns
763         * Character.DIRECTIONALITY_WHITESPACE. For an entity, it would be best to figure out the
764         * actual character, and return its dirtype, but treating it as whitespace is good enough
765         * for our purposes.
766         *
767         * @throws java.lang.IndexOutOfBoundsException if called when charIndex > length or <= 0.
768         */
769        byte dirTypeBackward() {
770            lastChar = text.charAt(charIndex - 1);
771            if (Character.isLowSurrogate(lastChar)) {
772                int codePoint = Character.codePointBefore(text, charIndex);
773                charIndex -= Character.charCount(codePoint);
774                return Character.getDirectionality(codePoint);
775            }
776            charIndex--;
777            byte dirType = getCachedDirectionality(lastChar);
778            if (isHtml) {
779                // Process tags and entities.
780                if (lastChar == '>') {
781                    dirType = skipTagBackward();
782                } else if (lastChar == ';') {
783                    dirType = skipEntityBackward();
784                }
785            }
786            return dirType;
787        }
788
789        /**
790         * Advances charIndex forward through an HTML tag (after the opening &lt; has already been
791         * read) and returns Character.DIRECTIONALITY_WHITESPACE. If there is no matching &gt;,
792         * does not change charIndex and returns Character.DIRECTIONALITY_OTHER_NEUTRALS (for the
793         * &lt; that hadn't been part of a tag after all).
794         */
795        private byte skipTagForward() {
796            int initialCharIndex = charIndex;
797            while (charIndex < length) {
798                lastChar = text.charAt(charIndex++);
799                if (lastChar == '>') {
800                    // The end of the tag.
801                    return Character.DIRECTIONALITY_WHITESPACE;
802                }
803                if (lastChar == '"' || lastChar == '\'') {
804                    // Skip over a quoted attribute value inside the tag.
805                    char quote = lastChar;
806                    while (charIndex < length && (lastChar = text.charAt(charIndex++)) != quote) {}
807                }
808            }
809            // The original '<' wasn't the start of a tag after all.
810            charIndex = initialCharIndex;
811            lastChar = '<';
812            return Character.DIRECTIONALITY_OTHER_NEUTRALS;
813        }
814
815        /**
816         * Advances charIndex backward through an HTML tag (after the closing &gt; has already been
817         * read) and returns Character.DIRECTIONALITY_WHITESPACE. If there is no matching &lt;, does
818         * not change charIndex and returns Character.DIRECTIONALITY_OTHER_NEUTRALS (for the &gt;
819         * that hadn't been part of a tag after all). Nevertheless, the running time for calling
820         * skipTagBackward() in a loop remains linear in the size of the text, even for a text like
821         * "&gt;&gt;&gt;&gt;", because skipTagBackward() also stops looking for a matching &lt;
822         * when it encounters another &gt;.
823         */
824        private byte skipTagBackward() {
825            int initialCharIndex = charIndex;
826            while (charIndex > 0) {
827                lastChar = text.charAt(--charIndex);
828                if (lastChar == '<') {
829                    // The start of the tag.
830                    return Character.DIRECTIONALITY_WHITESPACE;
831                }
832                if (lastChar == '>') {
833                    break;
834                }
835                if (lastChar == '"' || lastChar == '\'') {
836                    // Skip over a quoted attribute value inside the tag.
837                    char quote = lastChar;
838                    while (charIndex > 0 && (lastChar = text.charAt(--charIndex)) != quote) {}
839                }
840            }
841            // The original '>' wasn't the end of a tag after all.
842            charIndex = initialCharIndex;
843            lastChar = '>';
844            return Character.DIRECTIONALITY_OTHER_NEUTRALS;
845        }
846
847        /**
848         * Advances charIndex forward through an HTML character entity tag (after the opening
849         * &amp; has already been read) and returns Character.DIRECTIONALITY_WHITESPACE. It would be
850         * best to figure out the actual character and return its dirtype, but this is good enough.
851         */
852        private byte skipEntityForward() {
853            while (charIndex < length && (lastChar = text.charAt(charIndex++)) != ';') {}
854            return Character.DIRECTIONALITY_WHITESPACE;
855        }
856
857        /**
858         * Advances charIndex backward through an HTML character entity tag (after the closing ;
859         * has already been read) and returns Character.DIRECTIONALITY_WHITESPACE. It would be best
860         * to figure out the actual character and return its dirtype, but this is good enough.
861         * If there is no matching &amp;, does not change charIndex and returns
862         * Character.DIRECTIONALITY_OTHER_NEUTRALS (for the ';' that did not start an entity after
863         * all). Nevertheless, the running time for calling skipEntityBackward() in a loop remains
864         * linear in the size of the text, even for a text like ";;;;;;;", because skipTagBackward()
865         * also stops looking for a matching &amp; when it encounters another ;.
866         */
867        private byte skipEntityBackward() {
868            int initialCharIndex = charIndex;
869            while (charIndex > 0) {
870                lastChar = text.charAt(--charIndex);
871                if (lastChar == '&') {
872                    return Character.DIRECTIONALITY_WHITESPACE;
873                }
874                if (lastChar == ';') {
875                    break;
876                }
877            }
878            charIndex = initialCharIndex;
879            lastChar = ';';
880            return Character.DIRECTIONALITY_OTHER_NEUTRALS;
881        }
882    }
883}