libjava/gnu/gcj/convert/Output_UTF8.java


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128

/* Copyright (C) 1999, 2000, 2003  Free Software Foundation

   This file is part of libgcj.

This software is copyrighted work licensed under the terms of the
Libgcj License.  Please consult the file "LIBGCJ_LICENSE" for
details.  */

package gnu.gcj.convert;

/**
 * Convert Unicode to UTF8.
 * @author Per Bothner <bothner@cygnus.com>
 * @date March 1999.
 */

public class Output_UTF8 extends UnicodeToBytes
{
  public String getName() { return "UTF8"; }

  /** True if a surrogate pair should be emitted as a single UTF8 sequence.
   * Otherwise, a surrogate pair is treated as two separate characters.
   * Also, '\0' is emitted as {0} if true, and as {0xC0,0x80} if false. */
  public boolean standardUTF8 = true;

  // Saves the previous char if it was a high-surrogate.
  char hi_part;
  // Value of incomplete character.
  int value;
  // Number of continuation bytes still to emit.
  int bytes_todo;

  /**
   * Encode characters from inbuffer as UTF8 into the output buffer
   * (buf, starting at index count; neither is declared in this class,
   * so both are presumably inherited from UnicodeToBytes).
   *
   * Conversion state (a saved high surrogate, or continuation bytes
   * that did not fit) is kept in fields between calls, so a multi-byte
   * sequence or a surrogate pair may be split across calls.
   *
   * @param inbuffer source characters.
   * @param inpos index in inbuffer of the first character to convert.
   * @param inlength number of characters available starting at inpos.
   * @return the number of characters consumed from inbuffer.
   */
  public int write (char[] inbuffer, int inpos, int inlength)
  {
    int start_pos = inpos;
    int avail = buf.length - count;
    for (;;)
      {
        // Stop when the output is full, or when there is neither input
        // left nor continuation bytes owed from an earlier character.
        if (avail == 0 || (inlength == 0 && bytes_todo == 0))
          break;
        // The algorithm is made more complicated because we want to write
        // at least one byte in the output buffer, if there is room for
        // that byte, and at least one input character is available.
        // This makes the code more robust, since client code will
        // always "make progress", even in the complicated cases,
        // where the output buffer only has room for only *part* of a
        // multi-byte sequence, or the input char buffer only has half
        // of a surrogate pair (when standardUTF8 is set), or both.

        // Handle continuation characters we did not have room for before.
        if (bytes_todo > 0)
          {
            do
              {
                bytes_todo--;
                // Emit the next 6 bits of value, most significant first.
                buf[count++] = (byte)
                  (((value >> (bytes_todo * 6)) & 0x3F) | 0x80);
                avail--;
              }
            while (bytes_todo > 0 && avail > 0);
            continue;
          }

        char ch = inbuffer[inpos++];
        inlength--;

        if (hi_part != 0 && (ch <= 0xDBFF || ch > 0xDFFF))
          {
            // The previous character was a high surrogate, and we
            // don't now have a low surrogate: print the saved high
            // surrogate as an isolated three-byte character, and push
            // the current character back so it is processed on a
            // later iteration with hi_part cleared.
            --inpos;
            ++inlength;
            buf[count++] = (byte) (0xE0 | (hi_part >> 12));
            value = hi_part;
            hi_part = 0;
            avail--;
            bytes_todo = 2;
          }
        else if (hi_part == 0 && ch >= 0xDC00 && ch <= 0xDFFF)
          {
            // A low surrogate with no preceding high surrogate is
            // printed as an isolated three-byte character.  It must be
            // encoded from ch itself (hi_part is 0 here) and must be
            // consumed rather than pushed back; otherwise the same
            // character would be re-read forever while bogus
            // {0xE0,0x80,0x80} sequences fill the output buffer.
            buf[count++] = (byte) (0xE0 | (ch >> 12));
            value = ch;
            avail--;
            bytes_todo = 2;
          }
        else if (ch < 128 && (ch != 0 || standardUTF8))
          {
            // Single-byte character ('\0' only when standardUTF8).
            avail--;
            buf[count++] = (byte) ch;
          }
        else if (ch <= 0x07FF)
          {
            // Two-byte sequence; also '\0' as {0xC0,0x80} when
            // standardUTF8 is false.
            buf[count++] = (byte) (0xC0 | (ch >> 6));
            avail--;
            value = ch;
            bytes_todo = 1;
          }
        else if (ch >= 0xD800 && ch <= 0xDFFF && standardUTF8)
          {
            if (ch <= 0xDBFF)  // High surrogates
              {
                // Just save the high surrogate until the next
                // character comes along.
                hi_part = ch;
              }
            else // Low surrogates
              {
                // hi_part is non-zero here: the lone-low-surrogate case
                // was handled above.  Combine the pair into one code
                // point and start a four-byte sequence.
                value = (hi_part - 0xD800) * 0x400 + (ch - 0xDC00) + 0x10000;
                buf[count++] = (byte) (0xF0 | (value >> 18));
                avail--;
                bytes_todo = 3;
                hi_part = 0;
              }
          }
        else
          {
            // Three-byte sequence (including surrogates when
            // standardUTF8 is false).
            buf[count++] = (byte) (0xE0 | (ch >> 12));
            value = ch;
            avail--;
            bytes_todo = 2;
          }
      }
    return inpos - start_pos;
  }

  /**
   * Report whether continuation bytes of a partially written
   * character are still waiting to be emitted by write.
   *
   * NOTE(review): a high surrogate saved in hi_part at the end of the
   * input is not counted as pending here, and write does not emit it
   * when called with no input.  Confirm against the callers whether
   * such a trailing surrogate needs to be flushed.
   */
  public boolean havePendingBytes()
  {
    return bytes_todo > 0;
  }

}