1 files changed, 209 insertions, 85 deletions
diff --git a/gcc/java/lex.c b/gcc/java/lex.c
index 6efb9071780..4179b1dbca5 100644
--- a/gcc/java/lex.c
+++ b/gcc/java/lex.c
@@ -24,15 +24,15 @@ of Sun Microsystems, Inc. in the United States and other countries.
 The Free Software Foundation is independent of Sun Microsystems, Inc.  */
 
 /* It defines java_lex (yylex) that reads a Java ASCII source file
-possibly containing Unicode escape sequence or utf8 encoded characters
-and returns a token for everything found but comments, white spaces
-and line terminators. When necessary, it also fills the java_lval
-(yylval) union. It's implemented to be called by a re-entrant parser
-generated by Bison.
+   possibly containing Unicode escape sequences or UTF-8 encoded
+   characters and returns a token for everything found but comments,
+   white spaces and line terminators. When necessary, it also fills
+   the java_lval (yylval) union. It's implemented to be called by a
+   re-entrant parser generated by Bison.
 
-The lexical analysis conforms to the Java grammar described in "The
-Java(TM) Language Specification. J. Gosling, B. Joy, G. Steele.
-Addison Wesley 1996" (http://java.sun.com/docs/books/jls/html/3.doc.html)  */
+   The lexical analysis conforms to the Java grammar described in "The
+   Java(TM) Language Specification. J. Gosling, B. Joy, G. Steele.
+   Addison Wesley 1996" (http://java.sun.com/docs/books/jls/html/3.doc.html) */
 
 #include "keyword.h"
 
@@ -55,15 +55,18 @@ static int java_letter_or_digit_p PARAMS ((unicode_t));
 static int java_parse_doc_section PARAMS ((unicode_t));
 static void java_parse_end_comment PARAMS ((unicode_t));
 static unicode_t java_get_unicode PARAMS ((void));
-static unicode_t java_read_unicode PARAMS ((int, int *));
+static unicode_t java_read_unicode PARAMS ((java_lexer *, int, int *));
 static void java_store_unicode PARAMS ((struct java_line *, unicode_t, int));
-static unicode_t java_read_char PARAMS ((void));
+static unicode_t java_read_char PARAMS ((java_lexer *));
 static void java_allocate_new_line PARAMS ((void));
 static void java_unget_unicode PARAMS ((void));
 static unicode_t java_sneak_unicode PARAMS ((void));
+java_lexer *java_new_lexer PARAMS ((FILE *, const char *));
 
 void
-java_init_lex ()
+java_init_lex (finput, encoding)
+     FILE *finput;
+     const char *encoding;
 {
 #ifndef JC1_LITE
   int java_lang_imported = 0;
@@ -72,6 +75,8 @@ java_init_lex ()
     java_lang_id = get_identifier ("java.lang");
   if (!java_lang_cloneable)
     java_lang_cloneable = get_identifier ("java.lang.Cloneable");
+  if (!java_io_serializable)
+    java_io_serializable = get_identifier ("java.io.Serializable");
   if (!inst_id)
     inst_id = get_identifier ("inst$");
   if (!wpv_id)
@@ -112,9 +117,9 @@ java_init_lex ()
   ctxp->lineno = lineno = 0;
   ctxp->p_line = NULL;
   ctxp->c_line = NULL;
-  ctxp->unget_utf8_value = 0;
   ctxp->minus_seen = 0;
   ctxp->java_error_flag = 0;
+  ctxp->lexer = java_new_lexer (finput, encoding);
 }
 
 static char *
@@ -192,59 +197,180 @@ java_allocate_new_line ()
   ctxp->c_line->white_space_only = 1;
 }
 
-#define BAD_UTF8_VALUE 0xFFFE
-
-static unicode_t
-java_read_char ()
+/* Create a new lexer object reading FINPUT, encoded as ENCODING.  */
+java_lexer *
+java_new_lexer (finput, encoding)
+     FILE *finput;
+     const char *encoding;
 {
-  int c;
-  int c1, c2;
+  java_lexer *lex = (java_lexer *) xmalloc (sizeof (java_lexer));
+  int enc_error = 0;
+
+  lex->finput = finput;
+  lex->bs_count = 0;
+  lex->unget_value = 0;
 
-  if (ctxp->unget_utf8_value)
+#ifdef HAVE_ICONV
+  lex->handle = iconv_open ("UCS-2", encoding);
+  if (lex->handle == (iconv_t) -1)
     {
-      int to_return = ctxp->unget_utf8_value;
-      ctxp->unget_utf8_value = 0;
-      return (to_return);
+      /* FIXME: we should give a nice error based on errno here.  */
+      enc_error = 1;
     }
+  lex->first = -1;
+  lex->last = -1;
+#else /* HAVE_ICONV */
+  if (strcmp (encoding, DEFAULT_ENCODING))
+    enc_error = 1;
+#endif /* HAVE_ICONV */
 
-  c = GETC ();
+  if (enc_error)
+    fatal ("unknown encoding: `%s'", encoding);
 
-  if (c < 128)
-    return (unicode_t)c;
-  if (c == EOF)
-    return UEOF;
-  else
+  return lex;
+}
+
+void
+java_destroy_lexer (lex)
+     java_lexer *lex;
+{
+#ifdef HAVE_ICONV
+  iconv_close (lex->handle);
+#endif
+  free (lex);
+}
+
+static unicode_t
+java_read_char (lex)
+     java_lexer *lex;
+{
+  if (lex->unget_value)
     {
-      if ((c & 0xe0) == 0xc0)
-        {
-          c1 = GETC ();
-	  if ((c1 & 0xc0) == 0x80)
-	    return (unicode_t)(((c &0x1f) << 6) + (c1 & 0x3f));
-	  c = c1;
-	}
-      else if ((c & 0xf0) == 0xe0)
-        {
-          c1 = GETC ();
-	  if ((c1 & 0xc0) == 0x80)
-	    {
-	      c2 = GETC ();
-	      if ((c2 & 0xc0) == 0x80)
-	        return (unicode_t)(((c & 0xf) << 12) + 
-				   (( c1 & 0x3f) << 6) + (c2 & 0x3f));
-	      else
-		c = c2;
-	    }
-	  else
-	    c = c1;
-	}
-      /* We looked for a UTF8 multi-byte sequence (since we saw an initial
-	 byte with the high bit set), but found invalid bytes instead.
-	 If the most recent byte was Ascii (and not EOF), we should
-	 unget it, in case it was a comment terminator or other delimitor. */
-      if ((c & 0x80) == 0)
-	UNGETC (c);
-      return BAD_UTF8_VALUE;
+      unicode_t r = lex->unget_value;
+      lex->unget_value = 0;
+      return r;
     }
+
+#ifdef HAVE_ICONV
+  {
+    char out[2];
+    size_t ir, inbytesleft, in_save, out_count;
+    char *inp, *outp;
+
+    while (1)
+      {
+	/* See if we need to read more data.  If FIRST == 0 then the
+	   previous conversion attempt ended in the middle of a
+	   character at the end of the buffer.  Otherwise we only have
+	   to read if the buffer is empty.  */
+	if (lex->first == 0 || lex->first >= lex->last)
+	  {
+	    int r;
+
+	    if (lex->first >= lex->last)
+	      {
+		lex->first = 0;
+		lex->last = 0;
+	      }
+	    if (feof (lex->finput))
+	      return UEOF;
+	    r = fread (&lex->buffer[lex->last], 1,
+		       sizeof (lex->buffer) - lex->last,
+		       lex->finput);
+	    lex->last += r;
+	  }
+
+	inbytesleft = lex->last - lex->first;
+
+	if (inbytesleft == 0)
+	  {
+	    /* We've tried to read and there is nothing left.  */
+	    return UEOF;
+	  }
+
+	in_save = inbytesleft;
+	out_count = 2;
+	inp = &lex->buffer[lex->first];
+	outp = out;
+	ir = iconv (lex->handle, (const char **) &inp, &inbytesleft,
+		    &outp, &out_count);
+	lex->first += in_save - inbytesleft;
+
+	if (out_count == 0)
+	  {
+	    /* Success.  We assume that iconv emits UCS-2 in big-endian
+	       byte order; FIXME: the standard does not guarantee this.  */
+	    unicode_t result;
+	    result = (((unsigned char) out[0]) << 8) | (unsigned char) out[1];
+	    return result;
+	  }
+
+	if (ir == (size_t) -1)
+	  {
+	    if (errno == EINVAL)
+	      {
+		/* This is ok.  This means that the end of our buffer
+		   is in the middle of a character sequence.  We just
+		   move the valid part of the buffer to the beginning
+		   to force a read.  */
+		/* bcopy() is used because it handles overlapping
+		   regions.  FIXME: use memmove() instead.  */
+		bcopy (&lex->buffer[lex->first], &lex->buffer[0],
+		       lex->last - lex->first);
+		lex->last -= lex->first;
+		lex->first = 0;
+	      }
+	    else
+	      {
+		/* A more serious error.  */
+		java_lex_error ("unrecognized character in input stream", 0);
+		return UEOF;
+	      }
+	  }
+      }
+  }
+#else /* HAVE_ICONV */
+  {
+    int c, c1, c2;
+    c = getc (lex->finput);
+
+    if (c < 128)
+      return (unicode_t)c;
+    if (c == EOF)
+      return UEOF;
+    else
+      {
+	if ((c & 0xe0) == 0xc0)
+	  {
+	    c1 = getc (lex->finput);
+	    if ((c1 & 0xc0) == 0x80)
+	      return (unicode_t)(((c &0x1f) << 6) + (c1 & 0x3f));
+	    c = c1;
+	  }
+	else if ((c & 0xf0) == 0xe0)
+	  {
+	    c1 = getc (lex->finput);
+	    if ((c1 & 0xc0) == 0x80)
+	      {
+		c2 = getc (lex->finput);
+		if ((c2 & 0xc0) == 0x80)
+		  return (unicode_t)(((c & 0xf) << 12) + 
+				     (( c1 & 0x3f) << 6) + (c2 & 0x3f));
+		else
+		  c = c2;
+	      }
+	    else
+	      c = c1;
+	  }
+
+	/* We simply don't support invalid characters.  */
+	java_lex_error ("malformed UTF-8 character", 0);
+      }
+  }
+#endif /* HAVE_ICONV */
+
+  /* We only get here on error.  */
+  return UEOF;
 }
 
 static void
@@ -265,56 +391,54 @@ java_store_unicode (l, c, unicode_escape_p)
 }
 
 static unicode_t
-java_read_unicode (term_context, unicode_escape_p)
-    int term_context;
-    int *unicode_escape_p;
+java_read_unicode (lex, term_context, unicode_escape_p)
+     java_lexer *lex;
+     int term_context;
+     int *unicode_escape_p;
 {
   unicode_t c;
-  long i, base;
 
-  c = java_read_char ();
+  c = java_read_char (lex);
   *unicode_escape_p = 0;
 
   if (c != '\\')
-    return ((term_context ? c : 
-	     java_lineterminator (c) ? '\n' : (unicode_t)c));
-
-  /* Count the number of preceeding '\' */
-  for (base = ftell (finput), i = base-2; c == '\\';)
-    { 
-      fseek (finput, i--, SEEK_SET);
-      c = java_read_char ();	/* Will fail if reading utf8 stream. FIXME */
+    {
+      lex->bs_count = 0;
+      return (term_context ? c : (java_lineterminator (c)
+				  ? '\n'
+				  : (unicode_t) c));
     }
-  fseek (finput, base, SEEK_SET);
-  if ((base-i-3)%2 == 0)	/* If odd number of \ seen */
+
+  ++lex->bs_count;
+  if ((lex->bs_count) % 2 == 1)
     {
-      c = java_read_char ();
+      /* Odd number of \ seen.  */
+      c = java_read_char (lex);
       if (c == 'u')
         {
-	  unsigned short unicode = 0;
+	  unicode_t unicode = 0;
 	  int shift = 12;
 	  /* Next should be 4 hex digits, otherwise it's an error.
 	     The hex value is converted into the unicode, pushed into
 	     the Unicode stream.  */
 	  for (shift = 12; shift >= 0; shift -= 4)
 	    {
-	      if ((c = java_read_char ()) == UEOF)
+	      if ((c = java_read_char (lex)) == UEOF)
 	        return UEOF;
 	      if (c >= '0' && c <= '9')
 		unicode |= (unicode_t)((c-'0') << shift);
 	      else if ((c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F'))
 	        unicode |= (unicode_t)((10+(c | 0x20)-'a') << shift);
 	      else
-		  java_lex_error 
-		    ("Non hex digit in Unicode escape sequence", 0);
+		java_lex_error ("Non hex digit in Unicode escape sequence", 0);
 	    }
 	  *unicode_escape_p = 1;
-	  return (term_context ? unicode :
-		  (java_lineterminator (c) ? '\n' : unicode));
+	  return (term_context
+		  ? unicode : (java_lineterminator (c) ? '\n' : unicode));
 	}
-      ctxp->unget_utf8_value = c;
+      lex->unget_value = c;
     }
-  return (unicode_t)'\\';
+  return (unicode_t) '\\';
 }
 
 static unicode_t
@@ -329,7 +453,7 @@ java_get_unicode ()
 	for (;;)
 	  {
 	    int unicode_escape_p;
-	    c = java_read_unicode (0, &unicode_escape_p);
+	    c = java_read_unicode (ctxp->lexer, 0, &unicode_escape_p);
 	    java_store_unicode (ctxp->c_line, c, unicode_escape_p);
 	    if (ctxp->c_line->white_space_only 
 		&& !JAVA_WHITE_SPACE_P (c) && c!='\n')
@@ -352,7 +476,7 @@ java_lineterminator (c)
   else if (c == '\r')		/* CR */
     {
       int unicode_escape_p;
-      c = java_read_unicode (1, &unicode_escape_p);
+      c = java_read_unicode (ctxp->lexer, 1, &unicode_escape_p);
       if (c == '\r')
 	{
 	  /* In this case we will have another terminator.  For some
@@ -361,7 +485,7 @@ java_lineterminator (c)
 	     up in the actual text of the line, causing an error.  So
 	     instead we choose a very low-level method.  FIXME: this
 	     is incredibly ugly.  */
-	  UNGETC (c);
+	  ctxp->lexer->unget_value = c;
 	}
       else if (c != '\n')
 	{
@@ -937,7 +1061,7 @@ java_lex (java_lval)
       char *string;
 
       for (no_error = 1, c = java_get_unicode (); 
-	   c != '"' && c != '\n'; c = java_get_unicode ())
+	   c != UEOF && c != '"' && c != '\n'; c = java_get_unicode ())
 	{
 	  if (c == '\\')
 	    c = java_parse_escape_sequence ();