aboutsummaryrefslogtreecommitdiff
path: root/libgo/go/strings/strings.go
diff options
context:
space:
mode:
Diffstat (limited to 'libgo/go/strings/strings.go')
-rw-r--r--libgo/go/strings/strings.go274
1 files changed, 211 insertions, 63 deletions
diff --git a/libgo/go/strings/strings.go b/libgo/go/strings/strings.go
index 60a281a6ac5..0c836c09d46 100644
--- a/libgo/go/strings/strings.go
+++ b/libgo/go/strings/strings.go
@@ -72,22 +72,20 @@ func hashStrRev(sep string) (uint32, uint32) {
return hash, pow
}
-// Count counts the number of non-overlapping instances of sep in s.
-// If sep is an empty string, Count returns 1 + the number of Unicode code points in s.
-func Count(s, sep string) int {
- n := 0
- // special cases
- if len(sep) == 0 {
+// countGeneric implements Count.
+func countGeneric(s, substr string) int {
+ // special case
+ if len(substr) == 0 {
return utf8.RuneCountInString(s) + 1
}
- offset := 0
+ n := 0
for {
- i := Index(s[offset:], sep)
+ i := Index(s, substr)
if i == -1 {
return n
}
n++
- offset += i + len(sep)
+ s = s[i+len(substr):]
}
}
@@ -106,16 +104,16 @@ func ContainsRune(s string, r rune) bool {
return IndexRune(s, r) >= 0
}
-// LastIndex returns the index of the last instance of sep in s, or -1 if sep is not present in s.
-func LastIndex(s, sep string) int {
- n := len(sep)
+// LastIndex returns the index of the last instance of substr in s, or -1 if substr is not present in s.
+func LastIndex(s, substr string) int {
+ n := len(substr)
switch {
case n == 0:
return len(s)
case n == 1:
- return LastIndexByte(s, sep[0])
+ return LastIndexByte(s, substr[0])
case n == len(s):
- if sep == s {
+ if substr == s {
return 0
}
return -1
@@ -123,20 +121,20 @@ func LastIndex(s, sep string) int {
return -1
}
// Rabin-Karp search from the end of the string
- hashsep, pow := hashStrRev(sep)
+ hashss, pow := hashStrRev(substr)
last := len(s) - n
var h uint32
for i := len(s) - 1; i >= last; i-- {
h = h*primeRK + uint32(s[i])
}
- if h == hashsep && s[last:] == sep {
+ if h == hashss && s[last:] == substr {
return last
}
for i := last - 1; i >= 0; i-- {
h *= primeRK
h += uint32(s[i])
h -= pow * uint32(s[i+n])
- if h == hashsep && s[i:i+n] == sep {
+ if h == hashss && s[i:i+n] == substr {
return i
}
}
@@ -240,61 +238,187 @@ func genSplit(s, sep string, sepSave, n int) []string {
if n < 0 {
n = Count(s, sep) + 1
}
- c := sep[0]
- start := 0
+
a := make([]string, n)
- na := 0
- for i := 0; i+len(sep) <= len(s) && na+1 < n; i++ {
- if s[i] == c && (len(sep) == 1 || s[i:i+len(sep)] == sep) {
- a[na] = s[start : i+sepSave]
- na++
- start = i + len(sep)
- i += len(sep) - 1
- }
+ n--
+ i := 0
+ for i < n {
+ m := Index(s, sep)
+ if m < 0 {
+ break
+ }
+ a[i] = s[:m+sepSave]
+ s = s[m+len(sep):]
+ i++
}
- a[na] = s[start:]
- return a[0 : na+1]
+ a[i] = s
+ return a[:i+1]
}
// SplitN slices s into substrings separated by sep and returns a slice of
// the substrings between those separators.
-// If sep is empty, SplitN splits after each UTF-8 sequence.
+//
// The count determines the number of substrings to return:
// n > 0: at most n substrings; the last substring will be the unsplit remainder.
// n == 0: the result is nil (zero substrings)
// n < 0: all substrings
+//
+// Edge cases for s and sep (for example, empty strings) are handled
+// as described in the documentation for Split.
func SplitN(s, sep string, n int) []string { return genSplit(s, sep, 0, n) }
// SplitAfterN slices s into substrings after each instance of sep and
// returns a slice of those substrings.
-// If sep is empty, SplitAfterN splits after each UTF-8 sequence.
+//
// The count determines the number of substrings to return:
// n > 0: at most n substrings; the last substring will be the unsplit remainder.
// n == 0: the result is nil (zero substrings)
// n < 0: all substrings
+//
+// Edge cases for s and sep (for example, empty strings) are handled
+// as described in the documentation for SplitAfter.
func SplitAfterN(s, sep string, n int) []string {
return genSplit(s, sep, len(sep), n)
}
// Split slices s into all substrings separated by sep and returns a slice of
// the substrings between those separators.
-// If sep is empty, Split splits after each UTF-8 sequence.
+//
+// If s does not contain sep and sep is not empty, Split returns a
+// slice of length 1 whose only element is s.
+//
+// If sep is empty, Split splits after each UTF-8 sequence. If both s
+// and sep are empty, Split returns an empty slice.
+//
// It is equivalent to SplitN with a count of -1.
func Split(s, sep string) []string { return genSplit(s, sep, 0, -1) }
// SplitAfter slices s into all substrings after each instance of sep and
// returns a slice of those substrings.
-// If sep is empty, SplitAfter splits after each UTF-8 sequence.
+//
+// If s does not contain sep and sep is not empty, SplitAfter returns
+// a slice of length 1 whose only element is s.
+//
+// If sep is empty, SplitAfter splits after each UTF-8 sequence. If
+// both s and sep are empty, SplitAfter returns an empty slice.
+//
// It is equivalent to SplitAfterN with a count of -1.
func SplitAfter(s, sep string) []string {
return genSplit(s, sep, len(sep), -1)
}
+var asciiSpace = [256]uint8{'\t': 1, '\n': 1, '\v': 1, '\f': 1, '\r': 1, ' ': 1}
+
// Fields splits the string s around each instance of one or more consecutive white space
// characters, as defined by unicode.IsSpace, returning an array of substrings of s or an
// empty list if s contains only white space.
func Fields(s string) []string {
- return FieldsFunc(s, unicode.IsSpace)
+ // First count the fields.
+ // This is an exact count if s is ASCII, otherwise it is an approximation.
+ n := 0
+ wasSpace := 1
+ // setBits is used to track which bits are set in the bytes of s.
+ setBits := uint8(0)
+ for i := 0; i < len(s); i++ {
+ r := s[i]
+ setBits |= r
+ isSpace := int(asciiSpace[r])
+ n += wasSpace & ^isSpace
+ wasSpace = isSpace
+ }
+
+ if setBits < utf8.RuneSelf { // ASCII fast path
+ a := make([]string, n)
+ na := 0
+ fieldStart := 0
+ i := 0
+ // Skip spaces in the front of the input.
+ for i < len(s) && asciiSpace[s[i]] != 0 {
+ i++
+ }
+ fieldStart = i
+ for i < len(s) {
+ if asciiSpace[s[i]] == 0 {
+ i++
+ continue
+ }
+ a[na] = s[fieldStart:i]
+ na++
+ i++
+ // Skip spaces in between fields.
+ for i < len(s) && asciiSpace[s[i]] != 0 {
+ i++
+ }
+ fieldStart = i
+ }
+ if fieldStart < len(s) { // Last field might end at EOF.
+ a[na] = s[fieldStart:]
+ }
+ return a
+ }
+
+ // Some runes in the input string are not ASCII.
+ // Same general approach as in the ASCII path but
+ // uses DecodeRuneInString and unicode.IsSpace if
+ // a non-ASCII rune needs to be decoded and checked
+ // if it corresponds to a space.
+ a := make([]string, 0, n)
+ fieldStart := 0
+ i := 0
+ // Skip spaces in the front of the input.
+ for i < len(s) {
+ if c := s[i]; c < utf8.RuneSelf {
+ if asciiSpace[c] == 0 {
+ break
+ }
+ i++
+ } else {
+ r, w := utf8.DecodeRuneInString(s[i:])
+ if !unicode.IsSpace(r) {
+ break
+ }
+ i += w
+ }
+ }
+ fieldStart = i
+ for i < len(s) {
+ if c := s[i]; c < utf8.RuneSelf {
+ if asciiSpace[c] == 0 {
+ i++
+ continue
+ }
+ a = append(a, s[fieldStart:i])
+ i++
+ } else {
+ r, w := utf8.DecodeRuneInString(s[i:])
+ if !unicode.IsSpace(r) {
+ i += w
+ continue
+ }
+ a = append(a, s[fieldStart:i])
+ i += w
+ }
+ // Skip spaces in between fields.
+ for i < len(s) {
+ if c := s[i]; c < utf8.RuneSelf {
+ if asciiSpace[c] == 0 {
+ break
+ }
+ i++
+ } else {
+ r, w := utf8.DecodeRuneInString(s[i:])
+ if !unicode.IsSpace(r) {
+ break
+ }
+ i += w
+ }
+ }
+ fieldStart = i
+ }
+ if fieldStart < len(s) { // Last field might end at EOF.
+ a = append(a, s[fieldStart:])
+ }
+ return a
}
// FieldsFunc splits the string s at each run of Unicode code points c satisfying f(c)
@@ -383,40 +507,71 @@ func Map(mapping func(rune) rune, s string) string {
// In the worst case, the string can grow when mapped, making
// things unpleasant. But it's so rare we barge in assuming it's
// fine. It could also shrink but that falls out naturally.
- maxbytes := len(s) // length of b
- nbytes := 0 // number of bytes encoded in b
+
// The output buffer b is initialized on demand, the first
// time a character differs.
var b []byte
+ // nbytes is the number of bytes encoded in b.
+ var nbytes int
for i, c := range s {
r := mapping(c)
- if b == nil {
- if r == c {
- continue
- }
- b = make([]byte, maxbytes)
- nbytes = copy(b, s[:i])
+ if r == c {
+ continue
}
+
+ b = make([]byte, len(s)+utf8.UTFMax)
+ nbytes = copy(b, s[:i])
if r >= 0 {
- wid := 1
- if r >= utf8.RuneSelf {
- wid = utf8.RuneLen(r)
+ if r <= utf8.RuneSelf {
+ b[nbytes] = byte(r)
+ nbytes++
+ } else {
+ nbytes += utf8.EncodeRune(b[nbytes:], r)
}
- if nbytes+wid > maxbytes {
- // Grow the buffer.
- maxbytes = maxbytes*2 + utf8.UTFMax
- nb := make([]byte, maxbytes)
- copy(nb, b[0:nbytes])
- b = nb
- }
- nbytes += utf8.EncodeRune(b[nbytes:maxbytes], r)
}
+
+ if c == utf8.RuneError {
+ // RuneError is the result of either decoding
+ // an invalid sequence or '\uFFFD'. Determine
+ // the correct number of bytes we need to advance.
+ _, w := utf8.DecodeRuneInString(s[i:])
+ i += w
+ } else {
+ i += utf8.RuneLen(c)
+ }
+
+ s = s[i:]
+ break
}
+
if b == nil {
return s
}
- return string(b[0:nbytes])
+
+ for _, c := range s {
+ r := mapping(c)
+
+ // common case
+ if (0 <= r && r <= utf8.RuneSelf) && nbytes < len(b) {
+ b[nbytes] = byte(r)
+ nbytes++
+ continue
+ }
+
+ // b is not big enough or r is not a ASCII rune.
+ if r >= 0 {
+ if nbytes+utf8.UTFMax >= len(b) {
+ // Grow the buffer.
+ nb := make([]byte, 2*len(b))
+ copy(nb, b[:nbytes])
+ b = nb
+ }
+ nbytes += utf8.EncodeRune(b[nbytes:], r)
+ }
+ }
+
+ return string(b[:nbytes])
}
// Repeat returns a new string consisting of count copies of the string s.
@@ -561,17 +716,10 @@ func LastIndexFunc(s string, f func(rune) bool) int {
// truth==false, the sense of the predicate function is
// inverted.
func indexFunc(s string, f func(rune) bool, truth bool) int {
- start := 0
- for start < len(s) {
- wid := 1
- r := rune(s[start])
- if r >= utf8.RuneSelf {
- r, wid = utf8.DecodeRuneInString(s[start:])
- }
+ for i, r := range s {
if f(r) == truth {
- return start
+ return i
}
- start += wid
}
return -1
}