diff options
Diffstat (limited to 'gcc/java/lex.c')
-rw-r--r-- | gcc/java/lex.c | 294 |
1 files changed, 209 insertions, 85 deletions
diff --git a/gcc/java/lex.c b/gcc/java/lex.c index 6efb9071780..4179b1dbca5 100644 --- a/gcc/java/lex.c +++ b/gcc/java/lex.c @@ -24,15 +24,15 @@ of Sun Microsystems, Inc. in the United States and other countries. The Free Software Foundation is independent of Sun Microsystems, Inc. */ /* It defines java_lex (yylex) that reads a Java ASCII source file -possibly containing Unicode escape sequence or utf8 encoded characters -and returns a token for everything found but comments, white spaces -and line terminators. When necessary, it also fills the java_lval -(yylval) union. It's implemented to be called by a re-entrant parser -generated by Bison. + possibly containing Unicode escape sequence or utf8 encoded + characters and returns a token for everything found but comments, + white spaces and line terminators. When necessary, it also fills + the java_lval (yylval) union. It's implemented to be called by a + re-entrant parser generated by Bison. -The lexical analysis conforms to the Java grammar described in "The -Java(TM) Language Specification. J. Gosling, B. Joy, G. Steele. -Addison Wesley 1996" (http://java.sun.com/docs/books/jls/html/3.doc.html) */ + The lexical analysis conforms to the Java grammar described in "The + Java(TM) Language Specification. J. Gosling, B. Joy, G. Steele. + Addison Wesley 1996" (http://java.sun.com/docs/books/jls/html/3.doc.html) */ #include "keyword.h" @@ -55,15 +55,18 @@ static int java_letter_or_digit_p PARAMS ((unicode_t)); static int java_parse_doc_section PARAMS ((unicode_t)); static void java_parse_end_comment PARAMS ((unicode_t)); static unicode_t java_get_unicode PARAMS ((void)); -static unicode_t java_read_unicode PARAMS ((int, int *)); +static unicode_t java_read_unicode PARAMS ((java_lexer *, int, int *)); static void java_store_unicode PARAMS ((struct java_line *, unicode_t, int)); -static unicode_t java_read_char PARAMS ((void)); +static unicode_t java_read_char PARAMS ((java_lexer *)); static void java_allocate_new_line PARAMS ((void)); static void java_unget_unicode PARAMS ((void)); static unicode_t java_sneak_unicode PARAMS ((void)); +java_lexer *java_new_lexer PARAMS ((FILE *, const char *)); void -java_init_lex () +java_init_lex (finput, encoding) + FILE *finput; + const char *encoding; { #ifndef JC1_LITE int java_lang_imported = 0; @@ -72,6 +75,8 @@ java_init_lex () java_lang_id = get_identifier ("java.lang"); if (!java_lang_cloneable) java_lang_cloneable = get_identifier ("java.lang.Cloneable"); + if (!java_io_serializable) + java_io_serializable = get_identifier ("java.io.Serializable"); if (!inst_id) inst_id = get_identifier ("inst$"); if (!wpv_id) @@ -112,9 +117,9 @@ java_init_lex () ctxp->lineno = lineno = 0; ctxp->p_line = NULL; ctxp->c_line = NULL; - ctxp->unget_utf8_value = 0; ctxp->minus_seen = 0; ctxp->java_error_flag = 0; + ctxp->lexer = java_new_lexer (finput, encoding); } static char * @@ -192,59 +197,180 @@ java_allocate_new_line () ctxp->c_line->white_space_only = 1; } -#define BAD_UTF8_VALUE 0xFFFE - -static unicode_t -java_read_char () +/* Create a new lexer object. */ +java_lexer * +java_new_lexer (finput, encoding) + FILE *finput; + const char *encoding; { - int c; - int c1, c2; + java_lexer *lex = (java_lexer *) xmalloc (sizeof (java_lexer)); + int enc_error = 0; + + lex->finput = finput; + lex->bs_count = 0; + lex->unget_value = 0; - if (ctxp->unget_utf8_value) +#ifdef HAVE_ICONV + lex->handle = iconv_open ("UCS-2", encoding); + if (lex->handle == (iconv_t) -1) { - int to_return = ctxp->unget_utf8_value; - ctxp->unget_utf8_value = 0; - return (to_return); + /* FIXME: we should give a nice error based on errno here. */ + enc_error = 1; } + lex->first = -1; + lex->last = -1; +#else /* HAVE_ICONV */ + if (strcmp (encoding, DEFAULT_ENCODING)) + enc_error = 1; +#endif /* HAVE_ICONV */ - c = GETC (); + if (enc_error) + fatal ("unknown encoding: `%s'", encoding); - if (c < 128) - return (unicode_t)c; - if (c == EOF) - return UEOF; - else + return lex; +} + +void +java_destroy_lexer (lex) + java_lexer *lex; +{ +#ifdef HAVE_ICONV + iconv_close (lex->handle); +#endif + free (lex); +} + +static unicode_t +java_read_char (lex) + java_lexer *lex; +{ + if (lex->unget_value) { - if ((c & 0xe0) == 0xc0) - { - c1 = GETC (); - if ((c1 & 0xc0) == 0x80) - return (unicode_t)(((c &0x1f) << 6) + (c1 & 0x3f)); - c = c1; - } - else if ((c & 0xf0) == 0xe0) - { - c1 = GETC (); - if ((c1 & 0xc0) == 0x80) - { - c2 = GETC (); - if ((c2 & 0xc0) == 0x80) - return (unicode_t)(((c & 0xf) << 12) + - (( c1 & 0x3f) << 6) + (c2 & 0x3f)); - else - c = c2; - } - else - c = c1; - } - /* We looked for a UTF8 multi-byte sequence (since we saw an initial - byte with the high bit set), but found invalid bytes instead. - If the most recent byte was Ascii (and not EOF), we should - unget it, in case it was a comment terminator or other delimitor. */ - if ((c & 0x80) == 0) - UNGETC (c); - return BAD_UTF8_VALUE; + unicode_t r = lex->unget_value; + lex->unget_value = 0; + return r; } + +#ifdef HAVE_ICONV + { + char out[2]; + size_t ir, inbytesleft, in_save, out_count; + char *inp, *outp; + + while (1) + { + /* See if we need to read more data. If FIRST == 0 then the + previous conversion attempt ended in the middle of a + character at the end of the buffer. Otherwise we only have + to read if the buffer is empty. */ + if (lex->first == 0 || lex->first >= lex->last) + { + int r; + + if (lex->first >= lex->last) + { + lex->first = 0; + lex->last = 0; + } + if (feof (lex->finput)) + return UEOF; + r = fread (&lex->buffer[lex->last], 1, + sizeof (lex->buffer) - lex->last, + lex->finput); + lex->last += r; + } + + inbytesleft = lex->last - lex->first; + + if (inbytesleft == 0) + { + /* We've tried to read and there is nothing left. */ + return UEOF; + } + + in_save = inbytesleft; + out_count = 2; + inp = &lex->buffer[lex->first]; + outp = out; + ir = iconv (lex->handle, (const char **) &inp, &inbytesleft, + &outp, &out_count); + lex->first += in_save - inbytesleft; + + if (out_count == 0) + { + /* Success. We assume that UCS-2 is big-endian. This + appears to be an ok assumption. */ + unicode_t result; + result = (((unsigned char) out[0]) << 8) | (unsigned char) out[1]; + return result; + } + + if (ir == (size_t) -1) + { + if (errno == EINVAL) + { + /* This is ok. This means that the end of our buffer + is in the middle of a character sequence. We just + move the valid part of the buffer to the beginning + to force a read. */ + /* We use bcopy() because it should work for + overlapping strings. Use memmove() instead... */ + bcopy (&lex->buffer[lex->first], &lex->buffer[0], + lex->last - lex->first); + lex->last -= lex->first; + lex->first = 0; + } + else + { + /* A more serious error. */ + java_lex_error ("unrecognized character in input stream", 0); + return UEOF; + } + } + } + } +#else /* HAVE_ICONV */ + { + int c, c1, c2; + c = getc (lex->finput); + + if (c < 128) + return (unicode_t)c; + if (c == EOF) + return UEOF; + else + { + if ((c & 0xe0) == 0xc0) + { + c1 = getc (lex->finput); + if ((c1 & 0xc0) == 0x80) + return (unicode_t)(((c &0x1f) << 6) + (c1 & 0x3f)); + c = c1; + } + else if ((c & 0xf0) == 0xe0) + { + c1 = getc (lex->finput); + if ((c1 & 0xc0) == 0x80) + { + c2 = getc (lex->finput); + if ((c2 & 0xc0) == 0x80) + return (unicode_t)(((c & 0xf) << 12) + + (( c1 & 0x3f) << 6) + (c2 & 0x3f)); + else + c = c2; + } + else + c = c1; + } + + /* We simply don't support invalid characters. */ + java_lex_error ("malformed UTF-8 character", 0); + } + } +#endif /* HAVE_ICONV */ + + /* We only get here on error. */ + return UEOF; } static void @@ -265,56 +391,54 @@ java_store_unicode (l, c, unicode_escape_p) } static unicode_t -java_read_unicode (term_context, unicode_escape_p) - int term_context; - int *unicode_escape_p; +java_read_unicode (lex, term_context, unicode_escape_p) + java_lexer *lex; + int term_context; + int *unicode_escape_p; { unicode_t c; - long i, base; - c = java_read_char (); + c = java_read_char (lex); *unicode_escape_p = 0; if (c != '\\') - return ((term_context ? c : - java_lineterminator (c) ? '\n' : (unicode_t)c)); - - /* Count the number of preceeding '\' */ - for (base = ftell (finput), i = base-2; c == '\\';) - { - fseek (finput, i--, SEEK_SET); - c = java_read_char (); /* Will fail if reading utf8 stream. FIXME */ + { + lex->bs_count = 0; + return (term_context ? c : (java_lineterminator (c) + ? '\n' + : (unicode_t) c)); } - fseek (finput, base, SEEK_SET); - if ((base-i-3)%2 == 0) /* If odd number of \ seen */ + + ++lex->bs_count; + if ((lex->bs_count) % 2 == 1) { - c = java_read_char (); + /* Odd number of \ seen. */ + c = java_read_char (lex); if (c == 'u') { - unsigned short unicode = 0; + unicode_t unicode = 0; int shift = 12; /* Next should be 4 hex digits, otherwise it's an error. The hex value is converted into the unicode, pushed into the Unicode stream. */ for (shift = 12; shift >= 0; shift -= 4) { - if ((c = java_read_char ()) == UEOF) + if ((c = java_read_char (lex)) == UEOF) return UEOF; if (c >= '0' && c <= '9') unicode |= (unicode_t)((c-'0') << shift); else if ((c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F')) unicode |= (unicode_t)((10+(c | 0x20)-'a') << shift); else - java_lex_error - ("Non hex digit in Unicode escape sequence", 0); + java_lex_error ("Non hex digit in Unicode escape sequence", 0); } *unicode_escape_p = 1; - return (term_context ? unicode : - (java_lineterminator (c) ? '\n' : unicode)); + return (term_context + ? unicode : (java_lineterminator (c) ? '\n' : unicode)); } - ctxp->unget_utf8_value = c; + lex->unget_value = c; } - return (unicode_t)'\\'; + return (unicode_t) '\\'; } static unicode_t @@ -329,7 +453,7 @@ java_get_unicode () for (;;) { int unicode_escape_p; - c = java_read_unicode (0, &unicode_escape_p); + c = java_read_unicode (ctxp->lexer, 0, &unicode_escape_p); java_store_unicode (ctxp->c_line, c, unicode_escape_p); if (ctxp->c_line->white_space_only && !JAVA_WHITE_SPACE_P (c) && c!='\n') @@ -352,7 +476,7 @@ java_lineterminator (c) else if (c == '\r') /* CR */ { int unicode_escape_p; - c = java_read_unicode (1, &unicode_escape_p); + c = java_read_unicode (ctxp->lexer, 1, &unicode_escape_p); if (c == '\r') { /* In this case we will have another terminator. For some @@ -361,7 +485,7 @@ java_lineterminator (c) up in the actual text of the line, causing an error. So instead we choose a very low-level method. FIXME: this is incredibly ugly. */ - UNGETC (c); + ctxp->lexer->unget_value = c; } else if (c != '\n') { @@ -937,7 +1061,7 @@ java_lex (java_lval) char *string; for (no_error = 1, c = java_get_unicode (); - c != '"' && c != '\n'; c = java_get_unicode ()) + c != UEOF && c != '"' && c != '\n'; c = java_get_unicode ()) { if (c == '\\') c = java_parse_escape_sequence (); |