diff options
Diffstat (limited to 'libjava/gnu/gcj/text')
-rw-r--r-- | libjava/gnu/gcj/text/BaseBreakIterator.java | 82 | ||||
-rw-r--r-- | libjava/gnu/gcj/text/CharacterBreakIterator.java | 188 | ||||
-rw-r--r-- | libjava/gnu/gcj/text/LineBreakIterator.java | 168 | ||||
-rw-r--r-- | libjava/gnu/gcj/text/LocaleData_en.java | 81 | ||||
-rw-r--r-- | libjava/gnu/gcj/text/LocaleData_en_US.java | 71 | ||||
-rw-r--r-- | libjava/gnu/gcj/text/SentenceBreakIterator.java | 226 | ||||
-rw-r--r-- | libjava/gnu/gcj/text/WordBreakIterator.java | 224 |
7 files changed, 0 insertions, 1040 deletions
diff --git a/libjava/gnu/gcj/text/BaseBreakIterator.java b/libjava/gnu/gcj/text/BaseBreakIterator.java deleted file mode 100644 index 8c20d46d47c..00000000000 --- a/libjava/gnu/gcj/text/BaseBreakIterator.java +++ /dev/null @@ -1,82 +0,0 @@ -// Base class for default BreakIterators. - -/* Copyright (C) 1999 Free Software Foundation - - This file is part of libgcj. - -This software is copyrighted work licensed under the terms of the -Libgcj License. Please consult the file "LIBGCJ_LICENSE" for -details. */ - -package gnu.gcj.text; - -import java.text.BreakIterator; -import java.text.CharacterIterator; - -/** - * @author Tom Tromey <tromey@cygnus.com> - * @date March 22, 1999 - */ - -public abstract class BaseBreakIterator extends BreakIterator -{ - public int current () - { - return iter.getIndex(); - } - - public int first () - { - iter.first(); - return iter.getBeginIndex(); - } - - public int following (int pos) - { - int save = iter.getIndex(); - iter.setIndex(pos); - int r = next (); - iter.setIndex(save); - return r; - } - - public CharacterIterator getText () - { - return iter; - } - - public int last () - { - iter.last(); - return iter.getEndIndex(); - } - - public int next (int n) - { - int r = iter.getIndex (); - if (n > 0) - { - while (n > 0 && r != DONE) - { - r = next (); - --n; - } - } - else if (n < 0) - { - while (n < 0 && r != DONE) - { - r = previous (); - ++n; - } - } - return r; - } - - public void setText (CharacterIterator newText) - { - iter = newText; - } - - protected CharacterIterator iter; -} diff --git a/libjava/gnu/gcj/text/CharacterBreakIterator.java b/libjava/gnu/gcj/text/CharacterBreakIterator.java deleted file mode 100644 index ba087ccc461..00000000000 --- a/libjava/gnu/gcj/text/CharacterBreakIterator.java +++ /dev/null @@ -1,188 +0,0 @@ -// Default character BreakIterator. - -/* Copyright (C) 1999 Free Software Foundation - - This file is part of libgcj. - -This software is copyrighted work licensed under the terms of the -Libgcj License. Please consult the file "LIBGCJ_LICENSE" for -details. */ - -package gnu.gcj.text; - -import java.text.BreakIterator; -import java.text.CharacterIterator; - -/** - * @author Tom Tromey <tromey@cygnus.com> - * @date March 19, 1999 - * Written using The Unicode Standard, Version 2.0. - */ - -public class CharacterBreakIterator extends BaseBreakIterator -{ - // Hangul Jamo constants from Unicode book. - private static final int LBase = 0x1100; - private static final int VBase = 0x1161; - private static final int TBase = 0x11a7; - private static final int LCount = 19; - private static final int VCount = 21; - private static final int TCount = 28; - - // Information about surrogates. - private static final int highSurrogateStart = 0xD800; - private static final int highSurrogateEnd = 0xDBFF; - private static final int lowSurrogateStart = 0xDC00; - private static final int lowSurrogateEnd = 0xDFFF; - - public Object clone () - { - return new CharacterBreakIterator (this); - } - - public CharacterBreakIterator () - { - iter = null; // FIXME? - } - - private CharacterBreakIterator (CharacterBreakIterator other) - { - iter = (CharacterIterator) other.iter.clone(); - } - - // Some methods to tell us different properties of characters. - private final boolean isL (char c) - { - return c >= LBase && c <= LBase + LCount; - } - private final boolean isV (char c) - { - return c >= VBase && c <= VBase + VCount; - } - private final boolean isT (char c) - { - return c >= TBase && c <= TBase + TCount; - } - private final boolean isLVT (char c) - { - return isL (c) || isV (c) || isT (c); - } - private final boolean isHighSurrogate (char c) - { - return c >= highSurrogateStart && c <= highSurrogateEnd; - } - private final boolean isLowSurrogate (char c) - { - return c >= lowSurrogateStart && c <= lowSurrogateEnd; - } - - public int next () - { - int end = iter.getEndIndex(); - if (iter.getIndex() == end) - return DONE; - - char c; - for (char prev = CharacterIterator.DONE; iter.getIndex() < end; prev = c) - { - c = iter.next(); - if (c == CharacterIterator.DONE) - break; - int type = Character.getType(c); - - // Break after paragraph separators. - if (type == Character.PARAGRAPH_SEPARATOR) - break; - - // Now we need some lookahead. - char ahead = iter.next(); - iter.previous(); - if (ahead == CharacterIterator.DONE) - break; - int aheadType = Character.getType(ahead); - - if (aheadType != Character.NON_SPACING_MARK - && ! isLowSurrogate (ahead) - && ! isLVT (ahead)) - break; - if (! isLVT (c) && isLVT (ahead)) - break; - if (isL (c) && ! isLVT (ahead) - && aheadType != Character.NON_SPACING_MARK) - break; - if (isV (c) && ! isV (ahead) && !isT (ahead) - && aheadType != Character.NON_SPACING_MARK) - break; - if (isT (c) && ! isT (ahead) - && aheadType != Character.NON_SPACING_MARK) - break; - - if (! isHighSurrogate (c) && isLowSurrogate (ahead)) - break; - if (isHighSurrogate (c) && ! isLowSurrogate (ahead)) - break; - if (! isHighSurrogate (prev) && isLowSurrogate (c)) - break; - } - - return iter.getIndex(); - } - - public int previous () - { - if (iter.getIndex() == iter.getBeginIndex()) - return DONE; - - int start = iter.getBeginIndex(); - while (iter.getIndex() >= iter.getBeginIndex()) - { - char c = iter.previous(); - if (c == CharacterIterator.DONE) - break; - int type = Character.getType(c); - - if (type != Character.NON_SPACING_MARK - && ! isLowSurrogate (c) - && ! isLVT (c)) - break; - - // Now we need some lookahead. - char ahead = iter.previous(); - if (ahead == CharacterIterator.DONE) - { - iter.next(); - break; - } - char ahead2 = iter.previous(); - iter.next(); - iter.next(); - if (ahead2 == CharacterIterator.DONE) - break; - int aheadType = Character.getType(ahead); - - if (aheadType == Character.PARAGRAPH_SEPARATOR) - break; - - if (isLVT (c) && ! isLVT (ahead)) - break; - if (! isLVT (c) && type != Character.NON_SPACING_MARK - && isL (ahead)) - break; - if (! isV (c) && ! isT (c) && type != Character.NON_SPACING_MARK - && isV (ahead)) - break; - if (! isT (c) && type != Character.NON_SPACING_MARK - && isT (ahead)) - break; - - if (isLowSurrogate (c) && ! isHighSurrogate (ahead)) - break; - if (! isLowSurrogate (c) && isHighSurrogate (ahead)) - break; - if (isLowSurrogate (ahead) && ! isHighSurrogate (ahead2)) - break; - } - - return iter.getIndex(); - } -} diff --git a/libjava/gnu/gcj/text/LineBreakIterator.java b/libjava/gnu/gcj/text/LineBreakIterator.java deleted file mode 100644 index 4540b7ae549..00000000000 --- a/libjava/gnu/gcj/text/LineBreakIterator.java +++ /dev/null @@ -1,168 +0,0 @@ -// Default word BreakIterator. - -/* Copyright (C) 1999 Free Software Foundation - - This file is part of libgcj. - -This software is copyrighted work licensed under the terms of the -Libgcj License. Please consult the file "LIBGCJ_LICENSE" for -details. */ - -package gnu.gcj.text; - -import java.text.BreakIterator; -import java.text.CharacterIterator; - -/** - * @author Tom Tromey <tromey@cygnus.com> - * @date March 22, 1999 - * Written using The Unicode Standard, Version 2.0. - */ - -public class LineBreakIterator extends BaseBreakIterator -{ - public Object clone () - { - return new LineBreakIterator (this); - } - - public LineBreakIterator () - { - iter = null; - } - - private LineBreakIterator (LineBreakIterator other) - { - iter = (CharacterIterator) other.iter.clone(); - } - - // Some methods to tell us different properties of characters. - private final boolean isNb (char c) - { - return (c == 0x00a0 // NO-BREAK SPACE - || c == 0x2011 // NON-BREAKING HYPHEN - || c == 0xfeff); // ZERO WITH NO-BREAK SPACE - } - private final boolean isClose (int type) - { - return (type == Character.END_PUNCTUATION - // Unicode book says "comma, period, ...", which I take to - // mean "Po" class. - || type == Character.OTHER_PUNCTUATION); - } - private final boolean isIdeo (char c) - { - return (c >= 0x3040 && c <= 0x309f // Hiragana - || c >= 0x30a0 && c <= 0x30ff // Katakana - || c >= 0x4e00 && c <= 0x9fff // Han - || c >= 0x3100 && c <= 0x312f); // Bopomofo - } - - public int next () - { - int end = iter.getEndIndex(); - if (iter.getIndex() == end) - return DONE; - - while (iter.getIndex() < end) - { - char c = iter.current(); - int type = Character.getType(c); - - char n = iter.next(); - - if (n == CharacterIterator.DONE - || type == Character.PARAGRAPH_SEPARATOR - || type == Character.LINE_SEPARATOR) - break; - - // Handle two cases where we must scan for non-spacing marks. - int start = iter.getIndex(); - if (type == Character.SPACE_SEPARATOR - || type == Character.START_PUNCTUATION - || isIdeo (c)) - { - while (n != CharacterIterator.DONE - && Character.getType(n) == Character.NON_SPACING_MARK) - n = iter.next(); - if (n == CharacterIterator.DONE) - break; - - if (type == Character.SPACE_SEPARATOR) - { - int nt = Character.getType(n); - if (nt != Character.NON_SPACING_MARK - && nt != Character.SPACE_SEPARATOR - && ! isNb (n)) - break; - } - else if (type == Character.START_PUNCTUATION) - { - if (isIdeo (n)) - { - // Open punctuation followed by non spacing marks - // and then ideograph does not have a break in - // it. So skip all this. - start = iter.getIndex(); - } - } - else - { - // Ideograph preceded this character. - if (isClose (Character.getType(n))) - break; - } - } - iter.setIndex(start); - } - - return iter.getIndex(); - } - - public int previous () - { - int start = iter.getBeginIndex(); - if (iter.getIndex() == start) - return DONE; - - while (iter.getIndex() >= start) - { - char c = iter.previous(); - if (c == CharacterIterator.DONE) - break; - int type = Character.getType(c); - - char n = iter.previous(); - if (n == CharacterIterator.DONE) - break; - iter.next(); - - int nt = Character.getType(n); - // Break after paragraph separators. - if (nt == Character.PARAGRAPH_SEPARATOR - || nt == Character.LINE_SEPARATOR) - break; - - // Skip non-spacing marks. - int init = iter.getIndex(); - while (n != CharacterIterator.DONE && nt == Character.NON_SPACING_MARK) - { - n = iter.previous(); - nt = Character.getType(n); - } - - if (nt == Character.SPACE_SEPARATOR - && type != Character.SPACE_SEPARATOR - && type != Character.NON_SPACING_MARK - && ! isNb (c)) - break; - if (! isClose (type) && isIdeo (n)) - break; - if (isIdeo (c) && nt != Character.START_PUNCTUATION) - break; - iter.setIndex(init); - } - - return iter.getIndex(); - } -} diff --git a/libjava/gnu/gcj/text/LocaleData_en.java b/libjava/gnu/gcj/text/LocaleData_en.java deleted file mode 100644 index 161f3de7e8d..00000000000 --- a/libjava/gnu/gcj/text/LocaleData_en.java +++ /dev/null @@ -1,81 +0,0 @@ -// Generic English locale data for java.text. - -/* Copyright (C) 1999 Free Software Foundation - - This file is part of libgcj. - -This software is copyrighted work licensed under the terms of the -Libgcj License. Please consult the file "LIBGCJ_LICENSE" for -details. */ - -package gnu.gcj.text; - -import java.util.ListResourceBundle; - -/** - * @author Tom Tromey <tromey@cygnus.com> - * @date March 4, 1999 - */ - -public final class LocaleData_en extends ListResourceBundle -{ - // These are for DateFormatSymbols. - static final String[] ampmsDefault = {"AM", "PM" }; - static final String[] erasDefault = {"BC", "AD" }; - static final String localPatternCharsDefault = "GyMdkHmsSEDFwWahKz"; - static final String[] monthsDefault = { - "January", "February", "March", "April", "May", "June", - "July", "August", "September", "October", "November", "December", "" - }; - static final String[] shortMonthsDefault = { - "Jan", "Feb", "Mar", "Apr", "May", "Jun", - "Jul", "Aug", "Sep", "Oct", "Nov", "Dec", "" - }; - static final String[] shortWeekdaysDefault = { - "", "Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat" - }; - static final String[] weekdaysDefault = { - "", "Sunday", "Monday", "Tuesday", - "Wednesday", "Thursday", "Friday", "Saturday" - }; - - private static final Object[][] contents = - { - // These are for DecimalFormatSymbols. - { "decimalSeparator", "." }, - { "digit", "#" }, - { "exponential", "E" }, - { "groupingSeparator", "," }, - { "infinity", "\u221e" }, - { "minusSign", "-" }, - { "NaN", "\ufffd" }, - { "patternSeparator", ";" }, - { "percent", "%" }, - { "perMill", "\u2030" }, - { "zeroDigit", "0" }, - - // These are for NumberFormat. - { "numberFormat", "#,##0.###" }, - { "percentFormat", "#,##0%" }, - - // These are for DateFormatSymbols. - { "ampm", ampmsDefault }, - { "eras", erasDefault }, - { "datePatternChars", localPatternCharsDefault }, - { "months", monthsDefault }, - { "shortMonths", shortMonthsDefault }, - { "shortWeekdays", shortWeekdaysDefault }, - { "weekdays", weekdaysDefault }, - - // For RuleBasedCollator. - // FIXME: this is nowhere near complete. - // In particular we must mark accents as ignorable, - // and probably other things as well. - { "collatorRule", "< 0 < 1 < 2 < 3 < 4 < 5 < 6 < 7 < 8 < 9 < a,A < b,B < c,C < d,D < e,E < f,F < g,G < h,H < i,I < j,J < k,K < l,L < m,M < n,N < o,O < p,P < q,Q < r,R < s,S < t,T < u,U < v,V < w,W < x,X < y,Y < z,Z" } - }; - - protected Object[][] getContents () - { - return contents; - } -} diff --git a/libjava/gnu/gcj/text/LocaleData_en_US.java b/libjava/gnu/gcj/text/LocaleData_en_US.java deleted file mode 100644 index 083a86157f6..00000000000 --- a/libjava/gnu/gcj/text/LocaleData_en_US.java +++ /dev/null @@ -1,71 +0,0 @@ -// US English locale data for java.text. - -/* Copyright (C) 1999 Free Software Foundation - - This file is part of libgcj. - -This software is copyrighted work licensed under the terms of the -Libgcj License. Please consult the file "LIBGCJ_LICENSE" for -details. */ - -package gnu.gcj.text; - -import java.util.ListResourceBundle; - -/** - * @author Tom Tromey <tromey@cygnus.com> - * @date March 4, 1999 - */ - -public final class LocaleData_en_US extends ListResourceBundle -{ - // These are for DateFormatSymbols. - static String[][] zoneStringsDefault = { - { "PST", "Pacific Standard Time", "PST", - /**/ "Pacific Daylight Time", "PDT", "San Francisco" }, - { "MST", "Mountain Standard Time", "MST", - /**/ "Mountain Daylight Time", "MDT", "Denver" }, - { "PNT", "Mountain Standard Time", "MST", - /**/ "Mountain Standard Time", "MST", "Phoenix" }, - { "CST", "Central Standard Time", "CST", - /**/ "Central Daylight Time", "CDT", "Chicago" }, - { "EST", "Eastern Standard Time", "EST", - /**/ "Eastern Daylight Time", "EDT", "Boston" }, - { "IET", "Eastern Standard Time", "EST", - /**/ "Eastern Standard Time", "EST", "Indianapolis" }, - { "PRT", "Atlantic Standard Time", "AST", - /**/ "Atlantic Daylight Time", "ADT", "Halifax" }, - { "HST", "Hawaii Standard Time", "HST", - /**/ "Hawaii Daylight Time", "HDT", "Honolulu" }, - { "AST", "Alaska Standard Time", "AST", - /**/ "Alaska Daylight Time", "ADT", "Anchorage" } - }; - - private static final Object[][] contents = - { - // These are for DecimalFormatSymbols. - { "currency", "$" }, - { "intlCurrencySymbol", "$" }, // FIXME? - - // These are for NumberFormat. - { "currencyFormat", "$#,##0.00;($#,##0.00)" }, - - // These are for DateFormatSymbols. - { "zoneStrings", zoneStringsDefault }, - - // These are for DateFormat. - { "shortDateFormat", "M/d/yy" }, // Java's Y2K bug. - { "mediumDateFormat", "d-MMM-yy" }, - { "longDateFormat", "MMMM d, yyyy" }, - { "fullDateFormat", "EEEE MMMM d, yyyy G" }, - { "shortTimeFormat", "h:mm a" }, - { "mediumTimeFormat", "h:mm:ss a" }, - { "longTimeFormat", "h:mm:ss a z" }, - { "fullTimeFormat", "h:mm:ss;S 'o''clock' a z" } - }; - - protected Object[][] getContents () - { - return contents; - } -} diff --git a/libjava/gnu/gcj/text/SentenceBreakIterator.java b/libjava/gnu/gcj/text/SentenceBreakIterator.java deleted file mode 100644 index af2ccf10b93..00000000000 --- a/libjava/gnu/gcj/text/SentenceBreakIterator.java +++ /dev/null @@ -1,226 +0,0 @@ -// Default sentence BreakIterator. - -/* Copyright (C) 1999 Free Software Foundation - - This file is part of libgcj. - -This software is copyrighted work licensed under the terms of the -Libgcj License. Please consult the file "LIBGCJ_LICENSE" for -details. */ - -package gnu.gcj.text; - -import java.text.BreakIterator; -import java.text.CharacterIterator; - -/** - * @author Tom Tromey <tromey@cygnus.com> - * @date March 23, 1999 - * Written using The Unicode Standard, Version 2.0. - */ - -public class SentenceBreakIterator extends BaseBreakIterator -{ - public Object clone () - { - return new SentenceBreakIterator (this); - } - - public SentenceBreakIterator () - { - iter = null; - } - - private SentenceBreakIterator (SentenceBreakIterator other) - { - iter = (CharacterIterator) other.iter.clone(); - } - - public int next () - { - int end = iter.getEndIndex(); - if (iter.getIndex() == end) - return DONE; - - while (iter.getIndex() < end) - { - char c = iter.current(); - if (c == CharacterIterator.DONE) - break; - int type = Character.getType(c); - - char n = iter.next(); - if (n == CharacterIterator.DONE) - break; - - // Always break after paragraph separator. - if (type == Character.PARAGRAPH_SEPARATOR) - break; - - if (c == '!' || c == '?') - { - // Skip close punctuation. - while (n != CharacterIterator.DONE - && Character.getType(n) == Character.END_PUNCTUATION) - n = iter.next(); - // Skip spaces. - while (n != CharacterIterator.DONE - && Character.getType(n) == Character.SPACE_SEPARATOR) - n = iter.next(); - // Skip optional paragraph separator. - if (n != CharacterIterator.DONE - && Character.getType(n) == Character.PARAGRAPH_SEPARATOR) - n = iter.next(); - - // There's always a break somewhere after `!' or `?'. - break; - } - - if (c == '.') - { - int save = iter.getIndex(); - // Skip close punctuation. - while (n != CharacterIterator.DONE - && Character.getType(n) == Character.END_PUNCTUATION) - n = iter.next(); - // Skip spaces. We keep count because we need at least - // one for this period to represent a terminator. - int spcount = 0; - while (n != CharacterIterator.DONE - && Character.getType(n) == Character.SPACE_SEPARATOR) - { - n = iter.next(); - ++spcount; - } - if (spcount > 0) - { - int save2 = iter.getIndex(); - // Skip over open puncutation. - while (n != CharacterIterator.DONE - && Character.getType(n) == Character.START_PUNCTUATION) - n = iter.next(); - // Next character must not be lower case. - if (n == CharacterIterator.DONE - || ! Character.isLowerCase(n)) - { - iter.setIndex(save2); - break; - } - } - iter.setIndex(save); - } - } - - return iter.getIndex(); - } - - private final int previous_internal () - { - int start = iter.getBeginIndex(); - if (iter.getIndex() == start) - return DONE; - - while (iter.getIndex() >= start) - { - char c = iter.previous(); - if (c == CharacterIterator.DONE) - break; - - char n = iter.previous(); - if (n == CharacterIterator.DONE) - break; - iter.next(); - int nt = Character.getType(n); - - if (! Character.isLowerCase(c) - && (nt == Character.START_PUNCTUATION - || nt == Character.SPACE_SEPARATOR)) - { - int save = iter.getIndex(); - int save_nt = nt; - char save_n = n; - // Skip open punctuation. - while (n != CharacterIterator.DONE - && Character.getType(n) == Character.START_PUNCTUATION) - n = iter.previous(); - if (n == CharacterIterator.DONE) - break; - if (Character.getType(n) == Character.SPACE_SEPARATOR) - { - // Must have at least once space after the `.'. - int save2 = iter.getIndex(); - while (n != CharacterIterator.DONE - && Character.getType(n) == Character.SPACE_SEPARATOR) - n = iter.previous(); - // Skip close punctuation. - while (n != CharacterIterator.DONE - && Character.getType(n) == Character.END_PUNCTUATION) - n = iter.previous(); - if (n == CharacterIterator.DONE || n == '.') - { - // Communicate location of actual end. - period = iter.getIndex(); - iter.setIndex(save2); - break; - } - } - iter.setIndex(save); - nt = save_nt; - n = save_n; - } - - if (nt == Character.PARAGRAPH_SEPARATOR) - { - // Communicate location of actual end. - period = iter.getIndex(); - break; - } - else if (nt == Character.SPACE_SEPARATOR - || nt == Character.END_PUNCTUATION) - { - int save = iter.getIndex(); - // Skip spaces. - while (n != CharacterIterator.DONE - && Character.getType(n) == Character.SPACE_SEPARATOR) - n = iter.previous(); - // Skip close punctuation. - while (n != CharacterIterator.DONE - && Character.getType(n) == Character.END_PUNCTUATION) - n = iter.previous(); - int here = iter.getIndex(); - iter.setIndex(save); - if (n == CharacterIterator.DONE || n == '!' || n == '?') - { - // Communicate location of actual end. - period = here; - break; - } - } - else if (n == '!' || n == '?') - { - // Communicate location of actual end. - period = iter.getIndex(); - break; - } - } - - return iter.getIndex(); - } - - public int previous () - { - // We want to skip over the first sentence end to the second one. - // However, at the end of the string we want the first end. - int here = iter.getIndex(); - period = here; - int first = previous_internal (); - if (here == iter.getEndIndex() || first == DONE) - return first; - iter.setIndex(period); - return previous_internal (); - } - - // This is used for communication between previous and - // previous_internal. - private int period; -} diff --git a/libjava/gnu/gcj/text/WordBreakIterator.java b/libjava/gnu/gcj/text/WordBreakIterator.java deleted file mode 100644 index 553cdefdfc1..00000000000 --- a/libjava/gnu/gcj/text/WordBreakIterator.java +++ /dev/null @@ -1,224 +0,0 @@ -// Default word BreakIterator. - -/* Copyright (C) 1999 Free Software Foundation - - This file is part of libgcj. - -This software is copyrighted work licensed under the terms of the -Libgcj License. Please consult the file "LIBGCJ_LICENSE" for -details. */ - -package gnu.gcj.text; - -import java.text.BreakIterator; -import java.text.CharacterIterator; - -/** - * @author Tom Tromey <tromey@cygnus.com> - * @date March 22, 1999 - * Written using The Unicode Standard, Version 2.0. - */ - -public class WordBreakIterator extends BaseBreakIterator -{ - public Object clone () - { - return new WordBreakIterator (this); - } - - public WordBreakIterator () - { - iter = null; - } - - private WordBreakIterator (WordBreakIterator other) - { - iter = (CharacterIterator) other.iter.clone(); - } - - // Some methods to tell us different properties of characters. - private final boolean isHira (char c) - { - return c >= 0x3040 && c <= 0x309f; - } - private final boolean isKata (char c) - { - return c >= 0x30a0 && c <= 0x30ff; - } - private final boolean isHan (char c) - { - return c >= 0x4e00 && c <= 0x9fff; - } - - public int next () - { - int end = iter.getEndIndex(); - if (iter.getIndex() == end) - return DONE; - - while (iter.getIndex() < end) - { - char c = iter.current(); - if (c == CharacterIterator.DONE) - break; - int type = Character.getType(c); - - char n = iter.next(); - if (n == CharacterIterator.DONE) - break; - - // Break after paragraph separators. - if (type == Character.PARAGRAPH_SEPARATOR - || type == Character.LINE_SEPARATOR) - break; - - // Break between letters and non-letters. - // FIXME: we treat apostrophe as part of a word. This - // is an English-ism. - boolean is_letter = Character.isLetter(c); - if (c != '\'' && ! is_letter && type != Character.NON_SPACING_MARK - && Character.isLetter(n)) - break; - - // Always break after certain symbols, such as punctuation. - // This heuristic is derived from hints in the JCL book and is - // not part of Unicode. It seems to be right, however. - // FIXME: we treat apostrophe as part of a word. This - // is an English-ism. - if (c != '\'' - && (type == Character.DASH_PUNCTUATION - || type == Character.START_PUNCTUATION - || type == Character.END_PUNCTUATION - || type == Character.CONNECTOR_PUNCTUATION - || type == Character.OTHER_PUNCTUATION - || type == Character.MATH_SYMBOL - || type == Character.CURRENCY_SYMBOL - || type == Character.MODIFIER_SYMBOL - || type == Character.OTHER_SYMBOL - || type == Character.FORMAT - || type == Character.CONTROL)) - break; - - boolean is_hira = isHira (c); - boolean is_kata = isKata (c); - boolean is_han = isHan (c); - - // Special case Japanese. - if (! is_hira && ! is_kata && ! is_han - && type != Character.NON_SPACING_MARK - && (isHira (n) || isKata (n) || isHan (n))) - break; - - if (is_hira || is_kata || is_han || is_letter) - { - // Now we need to do some lookahead. We might need to do - // quite a bit of lookahead, so we save our position and - // restore it later. - int save = iter.getIndex(); - // Skip string of non spacing marks. - while (n != CharacterIterator.DONE - && Character.getType(n) == Character.NON_SPACING_MARK) - n = iter.next(); - if (n == CharacterIterator.DONE) - break; - if ((is_hira && ! isHira (n)) - || (is_kata && ! isHira (n) && ! isKata (n)) - || (is_han && ! isHira (n) && ! isHan (n)) - // FIXME: we treat apostrophe as part of a word. This - // is an English-ism. - || (is_letter && ! Character.isLetter(n) && n != '\'')) - break; - iter.setIndex(save); - } - } - - return iter.getIndex(); - } - - public int previous () - { - int start = iter.getBeginIndex(); - if (iter.getIndex() == start) - return DONE; - - while (iter.getIndex() >= start) - { - char c = iter.previous(); - if (c == CharacterIterator.DONE) - break; - - boolean is_hira = isHira (c); - boolean is_kata = isKata (c); - boolean is_han = isHan (c); - boolean is_letter = Character.isLetter(c); - - char n = iter.previous(); - if (n == CharacterIterator.DONE) - break; - iter.next(); - int type = Character.getType(n); - // Break after paragraph separators. - if (type == Character.PARAGRAPH_SEPARATOR - || type == Character.LINE_SEPARATOR) - break; - - // Break between letters and non-letters. - // FIXME: we treat apostrophe as part of a word. This - // is an English-ism. - if (n != '\'' && ! Character.isLetter(n) - && type != Character.NON_SPACING_MARK - && is_letter) - break; - - // Always break after certain symbols, such as punctuation. - // This heuristic is derived from hints in the JCL book and is - // not part of Unicode. It seems to be right, however. - // FIXME: we treat apostrophe as part of a word. This - // is an English-ism. - if (n != '\'' - && (type == Character.DASH_PUNCTUATION - || type == Character.START_PUNCTUATION - || type == Character.END_PUNCTUATION - || type == Character.CONNECTOR_PUNCTUATION - || type == Character.OTHER_PUNCTUATION - || type == Character.MATH_SYMBOL - || type == Character.CURRENCY_SYMBOL - || type == Character.MODIFIER_SYMBOL - || type == Character.OTHER_SYMBOL - || type == Character.FORMAT - || type == Character.CONTROL)) - break; - - // Special case Japanese. - if ((is_hira || is_kata || is_han) - && ! isHira (n) && ! isKata (n) && ! isHan (n) - && type != Character.NON_SPACING_MARK) - break; - - // We might have to skip over non spacing marks to see what's - // on the other side. - if (! is_hira || (! is_letter && c != '\'')) - { - int save = iter.getIndex(); - while (n != CharacterIterator.DONE - && Character.getType(n) == Character.NON_SPACING_MARK) - n = iter.previous(); - iter.setIndex(save); - // This is a strange case: a bunch of non-spacing marks at - // the beginning. We treat the current location as a word - // break. - if (n == CharacterIterator.DONE) - break; - if ((isHira (n) && ! is_hira) - || (isKata (n) && ! is_hira && ! is_kata) - || (isHan (n) && ! is_hira && ! is_han) - // FIXME: we treat apostrophe as part of a word. This - // is an English-ism. - || (! is_letter && c != '\'' && Character.isLetter(n))) - break; - } - } - - return iter.getIndex(); - } -} |