aboutsummaryrefslogtreecommitdiff
path: root/libcpp/lex.c
diff options
context:
space:
mode:
Diffstat (limited to 'libcpp/lex.c')
-rw-r--r--libcpp/lex.c384
1 files changed, 329 insertions, 55 deletions
diff --git a/libcpp/lex.c b/libcpp/lex.c
index 62a28f81b87..72c974a87e0 100644
--- a/libcpp/lex.c
+++ b/libcpp/lex.c
@@ -24,6 +24,12 @@ Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
#include "cpplib.h"
#include "internal.h"
+/* APPLE LOCAL begin CW asm blocks */
+/* A hack that would be better done with a callback or some such. */
+extern enum cw_asm_states { cw_asm_none, cw_asm_decls, cw_asm_asm } cw_asm_state;
+extern int cw_asm_in_operands;
+/* APPLE LOCAL end CW asm blocks */
+
enum spell_type
{
SPELL_OPERATOR = 0,
@@ -53,9 +59,8 @@ static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
static void add_line_note (cpp_buffer *, const uchar *, unsigned int);
static int skip_line_comment (cpp_reader *);
static void skip_whitespace (cpp_reader *, cppchar_t);
-static cpp_hashnode *lex_identifier (cpp_reader *, const uchar *);
-static void lex_number (cpp_reader *, cpp_string *);
-static bool forms_identifier_p (cpp_reader *, int);
+/* APPLE LOCAL mainline UCNs 2005-04-17 3892809 */
+/* Delete prototypes for lex_identifier, lex_number, forms_identifier_p */
static void lex_string (cpp_reader *, cpp_token *, const uchar *);
static void save_comment (cpp_reader *, cpp_token *, const uchar *, cppchar_t);
static void create_literal (cpp_reader *, cpp_token *, const uchar *,
@@ -430,10 +435,40 @@ name_p (cpp_reader *pfile, const cpp_string *string)
return 1;
}
+/* APPLE LOCAL begin mainline UCNs 2005-04-17 3892809 */
+/* After parsing an identifier or other sequence, produce a warning about
+ sequences not in NFC/NFKC. */
+static void
+warn_about_normalization (cpp_reader *pfile,
+ const cpp_token *token,
+ const struct normalize_state *s)
+{
+ if (CPP_OPTION (pfile, warn_normalize) < NORMALIZE_STATE_RESULT (s)
+ && !pfile->state.skipping)
+ {
+ /* Make sure that the token is printed using UCNs, even
+ if we'd otherwise happily print UTF-8. */
+ unsigned char *buf = xmalloc (cpp_token_len (token));
+ size_t sz;
+
+ sz = cpp_spell_token (pfile, token, buf, false) - buf;
+ if (NORMALIZE_STATE_RESULT (s) == normalized_C)
+ cpp_error_with_line (pfile, CPP_DL_WARNING, token->src_loc, 0,
+ "`%.*s' is not in NFKC", (int) sz, buf);
+ else
+ cpp_error_with_line (pfile, CPP_DL_WARNING, token->src_loc, 0,
+ "`%.*s' is not in NFC", (int) sz, buf);
+ }
+}
+/* APPLE LOCAL end mainline UCNs 2005-04-17 3892809 */
+
/* Returns TRUE if the sequence starting at buffer->cur is invalid in
an identifier. FIRST is TRUE if this starts an identifier. */
static bool
-forms_identifier_p (cpp_reader *pfile, int first)
+/* APPLE LOCAL begin mainline UCNs 2005-04-17 3892809 */
+forms_identifier_p (cpp_reader *pfile, int first,
+ struct normalize_state *state)
+/* APPLE LOCAL end mainline UCNs 2005-04-17 3892809 */
{
cpp_buffer *buffer = pfile->buffer;
@@ -453,54 +488,76 @@ forms_identifier_p (cpp_reader *pfile, int first)
}
/* Is this a syntactically valid UCN? */
- if (0 && *buffer->cur == '\\'
+ /* APPLE LOCAL begin mainline UCNs 2005-04-17 3892809 */
+ if ((CPP_OPTION (pfile, cplusplus) || CPP_OPTION (pfile, c99))
+ && *buffer->cur == '\\'
&& (buffer->cur[1] == 'u' || buffer->cur[1] == 'U'))
{
buffer->cur += 2;
- if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first))
+ if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
+ state))
return true;
buffer->cur -= 2;
}
+ /* APPLE LOCAL end mainline UCNs 2005-04-17 3892809 */
+
+ /* APPLE LOCAL begin CW asm blocks */
+ /* Allow [.+-] in CW asm opcodes (PowerPC specific). Do this here
+ so we don't have to figure out "bl- 45" vs "bl -45". */
+ if (cw_asm_state >= cw_asm_decls
+ && !cw_asm_in_operands
+ && (*buffer->cur == '.' || *buffer->cur == '+' || *buffer->cur == '-'))
+ {
+ buffer->cur++;
+ return true;
+ }
+ /* APPLE LOCAL end CW asm blocks */
return false;
}
/* Lex an identifier starting at BUFFER->CUR - 1. */
+/* APPLE LOCAL begin mainline UCNs 2005-04-17 3892809 */
static cpp_hashnode *
-lex_identifier (cpp_reader *pfile, const uchar *base)
+lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
+ struct normalize_state *nst)
{
cpp_hashnode *result;
- const uchar *cur, *limit;
+ const uchar *cur;
unsigned int len;
unsigned int hash = HT_HASHSTEP (0, *base);
cur = pfile->buffer->cur;
- for (;;)
+ if (! starts_ucn)
+ while (ISIDNUM (*cur))
+ {
+ hash = HT_HASHSTEP (hash, *cur);
+ cur++;
+ }
+ pfile->buffer->cur = cur;
+ if (starts_ucn || forms_identifier_p (pfile, false, nst))
{
- /* N.B. ISIDNUM does not include $. */
- while (ISIDNUM (*cur))
- {
- hash = HT_HASHSTEP (hash, *cur);
- cur++;
- }
-
- pfile->buffer->cur = cur;
- if (!forms_identifier_p (pfile, false))
- break;
-
- limit = pfile->buffer->cur;
- while (cur < limit)
- {
- hash = HT_HASHSTEP (hash, *cur);
- cur++;
- }
+ /* Slower version for identifiers containing UCNs (or $). */
+ do {
+ while (ISIDNUM (*pfile->buffer->cur))
+ {
+ pfile->buffer->cur++;
+ NORMALIZE_STATE_UPDATE_IDNUM (nst);
+ }
+ } while (forms_identifier_p (pfile, false, nst));
+ result = _cpp_interpret_identifier (pfile, base,
+ pfile->buffer->cur - base);
}
- len = cur - base;
- hash = HT_HASHFINISH (hash, len);
+ else
+ {
+ len = cur - base;
+ hash = HT_HASHFINISH (hash, len);
- result = (cpp_hashnode *)
- ht_lookup_with_hash (pfile->hash_table, base, len, hash, HT_ALLOC);
+ result = (cpp_hashnode *)
+ ht_lookup_with_hash (pfile->hash_table, base, len, hash, HT_ALLOC);
+ }
+/* APPLE LOCAL end mainline UCNs 2005-04-17 3892809 */
/* Rarely, identifiers require diagnostics when lexed. */
if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
&& !pfile->state.skipping, 0))
@@ -524,24 +581,32 @@ lex_identifier (cpp_reader *pfile, const uchar *base)
/* Lex a number to NUMBER starting at BUFFER->CUR - 1. */
static void
-lex_number (cpp_reader *pfile, cpp_string *number)
+/* APPLE LOCAL begin mainline UCNs 2005-04-17 3892809 */
+lex_number (cpp_reader *pfile, cpp_string *number,
+ struct normalize_state *nst)
+/* APPLE LOCAL end mainline UCNs 2005-04-17 3892809 */
{
const uchar *cur;
const uchar *base;
uchar *dest;
base = pfile->buffer->cur - 1;
+/* APPLE LOCAL begin mainline UCNs 2005-04-17 3892809 */
do
{
cur = pfile->buffer->cur;
/* N.B. ISIDNUM does not include $. */
while (ISIDNUM (*cur) || *cur == '.' || VALID_SIGN (*cur, cur[-1]))
- cur++;
+ {
+ cur++;
+ NORMALIZE_STATE_UPDATE_IDNUM (nst);
+ }
pfile->buffer->cur = cur;
}
- while (forms_identifier_p (pfile, false));
+ while (forms_identifier_p (pfile, false, nst));
+/* APPLE LOCAL end mainline UCNs 2005-04-17 3892809 */
number->len = cur - base;
dest = _cpp_unaligned_alloc (pfile, number->len + 1);
@@ -681,6 +746,47 @@ next_tokenrun (tokenrun *run)
return run->next;
}
+/* APPLE LOCAL begin AltiVec */
+/* Look ahead in the input stream. */
+const cpp_token *
+_cpp_peek_token (cpp_reader *pfile, int index)
+{
+ cpp_context *context = pfile->context;
+ const cpp_token *peektok;
+ int count;
+
+ /* First, scan through any pending cpp_context objects. */
+ while (context->prev)
+ {
+ ptrdiff_t sz = (context->direct_p
+ ? LAST (context).token - FIRST (context).token
+ : LAST (context).ptoken - FIRST (context).ptoken);
+
+ if (index < (int) sz)
+ return (context->direct_p
+ ? FIRST (context).token + index
+ : *(FIRST (context).ptoken + index));
+
+ index -= (int) sz;
+ context = context->prev;
+ }
+
+ /* We will have to read some new tokens after all (and do so
+ without invalidating preceding tokens). */
+ count = index;
+ pfile->keep_tokens++;
+
+ do
+ peektok = _cpp_lex_token (pfile);
+ while (index--);
+
+ _cpp_backup_tokens_direct (pfile, count + 1);
+ pfile->keep_tokens--;
+
+ return peektok;
+}
+/* APPLE LOCAL end AltiVec */
+
/* Allocate a single token that is invalidated at the same time as the
rest of the tokens on the line. Has its line and col set to the
same as the last lexed token, so that diagnostics appear in the
@@ -689,9 +795,34 @@ cpp_token *
_cpp_temp_token (cpp_reader *pfile)
{
cpp_token *old, *result;
+ /* APPLE LOCAL begin AltiVec */
+ ptrdiff_t sz = pfile->cur_run->limit - pfile->cur_token;
+ ptrdiff_t la = (ptrdiff_t) pfile->lookaheads;
+ /* APPLE LOCAL end AltiVec */
old = pfile->cur_token - 1;
- if (pfile->cur_token == pfile->cur_run->limit)
+ /* APPLE LOCAL begin AltiVec */
+ /* Any pre-existing lookaheads must not be clobbered. */
+ if (la)
+ {
+ if (sz <= la)
+ {
+ tokenrun *next = next_tokenrun (pfile->cur_run);
+
+ if (sz < la)
+ memmove (next->base + 1, next->base,
+ (la - sz) * sizeof (cpp_token));
+
+ next->base[0] = pfile->cur_run->limit[-1];
+ }
+
+ if (sz > 1)
+ memmove (pfile->cur_token + 1, pfile->cur_token,
+ MIN (la, sz - 1) * sizeof (cpp_token));
+ }
+
+ if (!sz)
+ /* APPLE LOCAL end AltiVec */
{
pfile->cur_run = next_tokenrun (pfile->cur_run);
pfile->cur_token = pfile->cur_run->base;
@@ -742,6 +873,11 @@ _cpp_lex_token (cpp_reader *pfile)
else
{
result = &pfile->directive_result;
+
+ /* APPLE LOCAL begin CW asm blocks C++ */
+ /* We put this back on the result. */
+ result->flags |= BOL;
+ /* APPLE LOCAL end CW asm blocks C++ */
break;
}
}
@@ -783,6 +919,12 @@ _cpp_get_fresh_line (cpp_reader *pfile)
if (!buffer->need_line)
return true;
+ /* APPLE LOCAL begin predictive compilation */
+ if (CPP_OPTION (pfile, predictive_compilation)
+ && buffer->next_line >= buffer->rlimit)
+ read_from_stdin (pfile);
+ /* APPLE LOCAL end predictive compilation */
+
if (buffer->next_line < buffer->rlimit)
{
_cpp_clean_line (pfile);
@@ -800,9 +942,14 @@ _cpp_get_fresh_line (cpp_reader *pfile)
{
/* Only warn once. */
buffer->next_line = buffer->rlimit;
- cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line,
- CPP_BUF_COLUMN (buffer, buffer->cur),
- "no newline at end of file");
+ /* APPLE LOCAL begin suppress no newline warning. */
+ if ( CPP_OPTION (pfile, warn_newline_at_eof))
+ {
+ cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line,
+ CPP_BUF_COLUMN (buffer, buffer->cur),
+ "no newline at end of file");
+ }
+ /* APPLE LOCAL end suppress no newline warning. */
}
return_at_eof = buffer->return_at_eof;
@@ -821,6 +968,10 @@ _cpp_get_fresh_line (cpp_reader *pfile)
} \
while (0)
+/* APPLE LOCAL begin CW asm blocks */
+static int cw_asm_label_follows;
+/* APPLE LOCAL end CW asm blocks */
+
/* Lex a token into pfile->cur_token, which is also incremented, to
get diagnostics pointing to the correct location.
@@ -897,9 +1048,21 @@ _cpp_lex_direct (cpp_reader *pfile)
case '0': case '1': case '2': case '3': case '4':
case '5': case '6': case '7': case '8': case '9':
- result->type = CPP_NUMBER;
- lex_number (pfile, &result->val.str);
- break;
+ /* APPLE LOCAL begin CW asm blocks */
+ /* An '@' in assembly code makes a following digit string into
+ an identifier. */
+ if (cw_asm_label_follows)
+ goto start_ident;
+ /* APPLE LOCAL end CW asm blocks */
+ /* APPLE LOCAL begin mainline UCNs 2005-04-17 3892809 */
+ {
+ struct normalize_state nst = INITIAL_NORMALIZE_STATE;
+ result->type = CPP_NUMBER;
+ lex_number (pfile, &result->val.str, &nst);
+ warn_about_normalization (pfile, result, &nst);
+ break;
+ }
+ /* APPLE LOCAL end mainline UCNs 2005-04-17 3892809 */
case 'L':
/* 'L' may introduce wide characters or strings. */
@@ -910,6 +1073,8 @@ _cpp_lex_direct (cpp_reader *pfile)
}
/* Fall through. */
+ /* APPLE LOCAL CW asm blocks */
+ start_ident:
case '_':
case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
@@ -922,7 +1087,14 @@ _cpp_lex_direct (cpp_reader *pfile)
case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
case 'Y': case 'Z':
result->type = CPP_NAME;
- result->val.node = lex_identifier (pfile, buffer->cur - 1);
+ /* APPLE LOCAL begin mainline UCNs 2005-04-17 3892809 */
+ {
+ struct normalize_state nst = INITIAL_NORMALIZE_STATE;
+ result->val.node = lex_identifier (pfile, buffer->cur - 1, false,
+ &nst);
+ warn_about_normalization (pfile, result, &nst);
+ }
+ /* APPLE LOCAL end mainline UCNs 2005-04-17 3892809 */
/* Convert named operators to their proper types. */
if (result->val.node->flags & NODE_OPERATOR)
@@ -930,6 +1102,10 @@ _cpp_lex_direct (cpp_reader *pfile)
result->flags |= NAMED_OP;
result->type = result->val.node->directive_index;
}
+ /* APPLE LOCAL begin CW asm blocks */
+ /* Got an identifier, reset the CW asm label hack flag. */
+ cw_asm_label_follows = 0;
+ /* APPLE LOCAL end CW asm blocks */
break;
case '\'':
@@ -1067,8 +1243,12 @@ _cpp_lex_direct (cpp_reader *pfile)
result->type = CPP_DOT;
if (ISDIGIT (*buffer->cur))
{
+ /* APPLE LOCAL begin mainline UCNs 2005-04-17 3892809 */
+ struct normalize_state nst = INITIAL_NORMALIZE_STATE;
result->type = CPP_NUMBER;
- lex_number (pfile, &result->val.str);
+ lex_number (pfile, &result->val.str, &nst);
+ warn_about_normalization (pfile, result, &nst);
+ /* APPLE LOCAL end mainline UCNs 2005-04-17 3892809 */
}
else if (*buffer->cur == '.' && buffer->cur[1] == '.')
buffer->cur += 2, result->type = CPP_ELLIPSIS;
@@ -1142,22 +1322,40 @@ _cpp_lex_direct (cpp_reader *pfile)
case ']': result->type = CPP_CLOSE_SQUARE; break;
case '{': result->type = CPP_OPEN_BRACE; break;
case '}': result->type = CPP_CLOSE_BRACE; break;
- case ';': result->type = CPP_SEMICOLON; break;
-
- /* @ is a punctuator in Objective-C. */
- case '@': result->type = CPP_ATSIGN; break;
+ /* APPLE LOCAL begin CW asm blocks */
+ case ';':
+ /* ';' separates instructions in CW asm, so flag that we're no
+ longer seeing operands. */
+ if (cw_asm_state >= cw_asm_decls)
+ cw_asm_in_operands = 0;
+ result->type = CPP_SEMICOLON;
+ break;
+ case '@':
+ /* In CW asm, @ can indicate a label, which may consist of
+ either letters or digits, so set a hack flag for this. (We
+ still want to return the @ as a separate token so that the
+ parser can distinguish labels from opcodes.) */
+ if (cw_asm_state >= cw_asm_decls)
+ cw_asm_label_follows = 1;
+ result->type = CPP_ATSIGN;
+ break;
+ /* APPLE LOCAL end CW asm blocks */
case '$':
case '\\':
{
const uchar *base = --buffer->cur;
+ /* APPLE LOCAL begin mainline UCNs 2005-04-17 3892809 */
+ struct normalize_state nst = INITIAL_NORMALIZE_STATE;
- if (forms_identifier_p (pfile, true))
+ if (forms_identifier_p (pfile, true, &nst))
{
result->type = CPP_NAME;
- result->val.node = lex_identifier (pfile, base);
+ result->val.node = lex_identifier (pfile, base, true, &nst);
+ warn_about_normalization (pfile, result, &nst);
break;
}
+ /* APPLE LOCAL end mainline UCNs 2005-04-17 3892809 */
buffer->cur++;
}
@@ -1180,19 +1378,59 @@ cpp_token_len (const cpp_token *token)
{
default: len = 4; break;
case SPELL_LITERAL: len = token->val.str.len; break;
- case SPELL_IDENT: len = NODE_LEN (token->val.node); break;
+ /* APPLE LOCAL mainline UCNs 2005-04-17 3892809 */
+ case SPELL_IDENT: len = NODE_LEN (token->val.node) * 10; break;
}
return len;
}
+/* APPLE LOCAL begin mainline UCNs 2005-04-17 3892809 */
+/* Parse UTF-8 out of NAMEP and place a \U escape in BUFFER.
+ Return the number of bytes read out of NAME. (There are always
+ 10 bytes written to BUFFER.) */
+
+static size_t
+utf8_to_ucn (unsigned char *buffer, const unsigned char *name)
+{
+ int j;
+ int ucn_len = 0;
+ int ucn_len_c;
+ unsigned t;
+ unsigned long utf32;
+
+ /* Compute the length of the UTF-8 sequence. */
+ for (t = *name; t & 0x80; t <<= 1)
+ ucn_len++;
+
+ utf32 = *name & (0x7F >> ucn_len);
+ for (ucn_len_c = 1; ucn_len_c < ucn_len; ucn_len_c++)
+ {
+ utf32 = (utf32 << 6) | (*++name & 0x3F);
+
+ /* Ill-formed UTF-8. */
+ if ((*name & ~0x3F) != 0x80)
+ abort ();
+ }
+
+ *buffer++ = '\\';
+ *buffer++ = 'U';
+ for (j = 7; j >= 0; j--)
+ *buffer++ = "0123456789abcdef"[(utf32 >> (4 * j)) & 0xF];
+ return ucn_len;
+}
+
+
/* Write the spelling of a token TOKEN to BUFFER. The buffer must
already contain the enough space to hold the token's spelling.
Returns a pointer to the character after the last character written.
+ FORSTRING is true if this is to be the spelling after translation
+ phase 1 (this is different for UCNs).
FIXME: Would be nice if we didn't need the PFILE argument. */
unsigned char *
cpp_spell_token (cpp_reader *pfile, const cpp_token *token,
- unsigned char *buffer)
+ unsigned char *buffer, bool forstring)
+/* APPLE LOCAL end mainline UCNs 2005-04-17 3892809 */
{
switch (TOKEN_SPELL (token))
{
@@ -1216,8 +1454,28 @@ cpp_spell_token (cpp_reader *pfile, const cpp_token *token,
spell_ident:
case SPELL_IDENT:
- memcpy (buffer, NODE_NAME (token->val.node), NODE_LEN (token->val.node));
- buffer += NODE_LEN (token->val.node);
+/* APPLE LOCAL begin mainline UCNs 2005-04-17 3892809 */
+ if (forstring)
+ {
+ memcpy (buffer, NODE_NAME (token->val.node),
+ NODE_LEN (token->val.node));
+ buffer += NODE_LEN (token->val.node);
+ }
+ else
+ {
+ size_t i;
+ const unsigned char * name = NODE_NAME (token->val.node);
+
+ for (i = 0; i < NODE_LEN (token->val.node); i++)
+ if (name[i] & ~0x7F)
+ {
+ i += utf8_to_ucn (buffer, name + i) - 1;
+ buffer += 10;
+ }
+ else
+ *buffer++ = NODE_NAME (token->val.node)[i];
+ }
+/* APPLE LOCAL end mainline UCNs 2005-04-17 3892809 */
break;
case SPELL_LITERAL:
@@ -1242,7 +1500,8 @@ cpp_token_as_text (cpp_reader *pfile, const cpp_token *token)
unsigned int len = cpp_token_len (token) + 1;
unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
- end = cpp_spell_token (pfile, token, start);
+/* APPLE LOCAL mainline UCNs 2005-04-17 3892809 */
+ end = cpp_spell_token (pfile, token, start, false);
end[0] = '\0';
return start;
@@ -1286,9 +1545,24 @@ cpp_output_token (const cpp_token *token, FILE *fp)
spell_ident:
case SPELL_IDENT:
- fwrite (NODE_NAME (token->val.node), 1, NODE_LEN (token->val.node), fp);
- break;
+/* APPLE LOCAL begin mainline UCNs 2005-04-17 3892809 */
+ {
+ size_t i;
+ const unsigned char * name = NODE_NAME (token->val.node);
+
+ for (i = 0; i < NODE_LEN (token->val.node); i++)
+ if (name[i] & ~0x7F)
+ {
+ unsigned char buffer[10];
+ i += utf8_to_ucn (buffer, name + i) - 1;
+ fwrite (buffer, 1, 10, fp);
+ }
+ else
+ fputc (NODE_NAME (token->val.node)[i], fp);
+ }
+ break;
+/* APPLE LOCAL end mainline UCNs 2005-04-17 3892809 */
case SPELL_LITERAL:
fwrite (token->val.str.text, 1, token->val.str.len, fp);
break;