Diffstat (limited to 'gcc/config/i386/i386.c')
-rw-r--r-- | gcc/config/i386/i386.c | 809
1 files changed, 569 insertions, 240 deletions
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 2f66a01230b..ae79a13fe8e 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -84,7 +84,15 @@ struct processor_costs size_cost = { /* costs for tunning for size */ 3, /* MMX or SSE register to integer */ 0, /* size of prefetch block */ 0, /* number of parallel prefetches */ + 1, /* Branch cost */ + 2, /* cost of FADD and FSUB insns. */ + 2, /* cost of FMUL instruction. */ + 2, /* cost of FDIV instruction. */ + 2, /* cost of FABS instruction. */ + 2, /* cost of FCHS instruction. */ + 2, /* cost of FSQRT instruction. */ }; + /* Processor costs (relative to an add) */ static const struct processor_costs i386_cost = { /* 386 specific costs */ @@ -121,6 +129,13 @@ struct processor_costs i386_cost = { /* 386 specific costs */ 3, /* MMX or SSE register to integer */ 0, /* size of prefetch block */ 0, /* number of parallel prefetches */ + 1, /* Branch cost */ + 23, /* cost of FADD and FSUB insns. */ + 27, /* cost of FMUL instruction. */ + 88, /* cost of FDIV instruction. */ + 22, /* cost of FABS instruction. */ + 24, /* cost of FCHS instruction. */ + 122, /* cost of FSQRT instruction. */ }; static const @@ -158,6 +173,13 @@ struct processor_costs i486_cost = { /* 486 specific costs */ 3, /* MMX or SSE register to integer */ 0, /* size of prefetch block */ 0, /* number of parallel prefetches */ + 1, /* Branch cost */ + 8, /* cost of FADD and FSUB insns. */ + 16, /* cost of FMUL instruction. */ + 73, /* cost of FDIV instruction. */ + 3, /* cost of FABS instruction. */ + 3, /* cost of FCHS instruction. */ + 83, /* cost of FSQRT instruction. */ }; static const @@ -195,6 +217,13 @@ struct processor_costs pentium_cost = { 3, /* MMX or SSE register to integer */ 0, /* size of prefetch block */ 0, /* number of parallel prefetches */ + 2, /* Branch cost */ + 3, /* cost of FADD and FSUB insns. */ + 3, /* cost of FMUL instruction. */ + 39, /* cost of FDIV instruction. */ + 1, /* cost of FABS instruction. */ + 1, /* cost of FCHS instruction. */ + 70, /* cost of FSQRT instruction. */ }; static const @@ -232,6 +261,13 @@ struct processor_costs pentiumpro_cost = { 3, /* MMX or SSE register to integer */ 32, /* size of prefetch block */ 6, /* number of parallel prefetches */ + 2, /* Branch cost */ + 3, /* cost of FADD and FSUB insns. */ + 5, /* cost of FMUL instruction. */ + 56, /* cost of FDIV instruction. */ + 2, /* cost of FABS instruction. */ + 2, /* cost of FCHS instruction. */ + 56, /* cost of FSQRT instruction. */ }; static const @@ -269,6 +305,13 @@ struct processor_costs k6_cost = { 6, /* MMX or SSE register to integer */ 32, /* size of prefetch block */ 1, /* number of parallel prefetches */ + 1, /* Branch cost */ + 2, /* cost of FADD and FSUB insns. */ + 2, /* cost of FMUL instruction. */ + 56, /* cost of FDIV instruction. */ + 2, /* cost of FABS instruction. */ + 2, /* cost of FCHS instruction. */ + 56, /* cost of FSQRT instruction. */ }; static const @@ -285,38 +328,45 @@ struct processor_costs athlon_cost = { 8, /* "large" insn */ 9, /* MOVE_RATIO */ 4, /* cost for loading QImode using movzbl */ - {4, 5, 4}, /* cost of loading integer registers + {3, 4, 3}, /* cost of loading integer registers in QImode, HImode and SImode. Relative to reg-reg move (2). 
*/ - {2, 3, 2}, /* cost of storing integer registers */ + {3, 4, 3}, /* cost of storing integer registers */ 4, /* cost of reg,reg fld/fst */ - {6, 6, 20}, /* cost of loading fp registers + {4, 4, 12}, /* cost of loading fp registers in SFmode, DFmode and XFmode */ - {4, 4, 16}, /* cost of loading integer registers */ + {6, 6, 8}, /* cost of loading integer registers */ 2, /* cost of moving MMX register */ - {2, 2}, /* cost of loading MMX registers + {4, 4}, /* cost of loading MMX registers in SImode and DImode */ - {2, 2}, /* cost of storing MMX registers + {4, 4}, /* cost of storing MMX registers in SImode and DImode */ 2, /* cost of moving SSE register */ - {2, 2, 8}, /* cost of loading SSE registers + {4, 4, 6}, /* cost of loading SSE registers in SImode, DImode and TImode */ - {2, 2, 8}, /* cost of storing SSE registers + {4, 4, 5}, /* cost of storing SSE registers in SImode, DImode and TImode */ - 6, /* MMX or SSE register to integer */ + 5, /* MMX or SSE register to integer */ 64, /* size of prefetch block */ 6, /* number of parallel prefetches */ + 2, /* Branch cost */ + 4, /* cost of FADD and FSUB insns. */ + 4, /* cost of FMUL instruction. */ + 24, /* cost of FDIV instruction. */ + 2, /* cost of FABS instruction. */ + 2, /* cost of FCHS instruction. */ + 35, /* cost of FSQRT instruction. */ }; static const struct processor_costs pentium4_cost = { 1, /* cost of an add instruction */ 1, /* cost of a lea instruction */ - 8, /* variable shift costs */ - 8, /* constant shift costs */ - 30, /* cost of starting a multiply */ + 4, /* variable shift costs */ + 4, /* constant shift costs */ + 15, /* cost of starting a multiply */ 0, /* cost of multiply per each bit set */ - 112, /* cost of a divide/mod */ + 56, /* cost of a divide/mod */ 1, /* cost of movsx */ 1, /* cost of movzx */ 16, /* "large" insn */ @@ -343,6 +393,13 @@ struct processor_costs pentium4_cost = { 10, /* MMX or SSE register to integer */ 64, /* size of prefetch block */ 6, /* number of parallel prefetches */ + 2, /* Branch cost */ + 5, /* cost of FADD and FSUB insns. */ + 7, /* cost of FMUL instruction. */ + 43, /* cost of FDIV instruction. */ + 2, /* cost of FABS instruction. */ + 2, /* cost of FCHS instruction. */ + 43, /* cost of FSQRT instruction. */ }; const struct processor_costs *ix86_cost = &pentium_cost; @@ -396,6 +453,13 @@ const int x86_epilogue_using_move = m_ATHLON | m_PENT4 | m_PPRO; const int x86_decompose_lea = m_PENT4; const int x86_shift1 = ~m_486; const int x86_arch_always_fancy_math_387 = m_PENT | m_PPRO | m_ATHLON | m_PENT4; +const int x86_sse_partial_reg_dependency = m_PENT4 | m_PPRO; +/* Set for machines where the type and dependencies are resolved on SSE register + parts insetad of whole registers, so we may maintain just lower part of + scalar values in proper format leaving the upper part undefined. 
*/ +const int x86_sse_partial_regs = m_ATHLON; +const int x86_sse_typeless_stores = m_ATHLON; +const int x86_sse_load0_by_pxor = m_PPRO | m_PENT4; /* In case the avreage insn count for single function invocation is lower than this constant, emit fast (but longer) prologue and @@ -701,6 +765,10 @@ static int ix86_variable_issue PARAMS ((FILE *, int, rtx, int)); static int ia32_use_dfa_pipeline_interface PARAMS ((void)); static int ia32_multipass_dfa_lookahead PARAMS ((void)); static void ix86_init_mmx_sse_builtins PARAMS ((void)); +static rtx ia32_this_parameter PARAMS ((tree)); +static void x86_output_mi_thunk PARAMS ((FILE *, tree, HOST_WIDE_INT, tree)); +static void x86_output_mi_vcall_thunk PARAMS ((FILE *, tree, HOST_WIDE_INT, + HOST_WIDE_INT, tree)); struct ix86_address { @@ -722,8 +790,6 @@ static rtx ix86_expand_sse_compare PARAMS ((const struct builtin_description *, static rtx ix86_expand_unop1_builtin PARAMS ((enum insn_code, tree, rtx)); static rtx ix86_expand_unop_builtin PARAMS ((enum insn_code, tree, rtx, int)); static rtx ix86_expand_binop_builtin PARAMS ((enum insn_code, tree, rtx)); -static rtx ix86_expand_timode_binop_builtin PARAMS ((enum insn_code, - tree, rtx)); static rtx ix86_expand_store_builtin PARAMS ((enum insn_code, tree)); static rtx safe_vector_operand PARAMS ((rtx, enum machine_mode)); static enum rtx_code ix86_fp_compare_code_to_integer PARAMS ((enum rtx_code)); @@ -741,7 +807,9 @@ static unsigned int ix86_select_alt_pic_regnum PARAMS ((void)); static int ix86_save_reg PARAMS ((unsigned int, int)); static void ix86_compute_frame_layout PARAMS ((struct ix86_frame *)); static int ix86_comp_type_attributes PARAMS ((tree, tree)); +static int ix86_fntype_regparm PARAMS ((tree)); const struct attribute_spec ix86_attribute_table[]; +static bool ix86_function_ok_for_sibcall PARAMS ((tree, tree)); static tree ix86_handle_cdecl_attribute PARAMS ((tree *, tree, tree, int, bool *)); static tree ix86_handle_regparm_attribute PARAMS ((tree *, tree, tree, int, bool *)); static int ix86_value_regno PARAMS ((enum machine_mode)); @@ -843,11 +911,19 @@ static enum x86_64_reg_class merge_classes PARAMS ((enum x86_64_reg_class, #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \ ia32_multipass_dfa_lookahead +#undef TARGET_FUNCTION_OK_FOR_SIBCALL +#define TARGET_FUNCTION_OK_FOR_SIBCALL ix86_function_ok_for_sibcall + #ifdef HAVE_AS_TLS #undef TARGET_HAVE_TLS #define TARGET_HAVE_TLS true #endif +#undef TARGET_ASM_OUTPUT_MI_THUNK +#define TARGET_ASM_OUTPUT_MI_THUNK x86_output_mi_thunk +#undef TARGET_ASM_OUTPUT_MI_VCALL_THUNK +#define TARGET_ASM_OUTPUT_MI_VCALL_THUNK x86_output_mi_vcall_thunk + struct gcc_target targetm = TARGET_INITIALIZER; /* Sometimes certain combinations of command options do not make @@ -876,17 +952,16 @@ override_options () const int align_jump; const int align_jump_max_skip; const int align_func; - const int branch_cost; } const processor_target_table[PROCESSOR_max] = { - {&i386_cost, 0, 0, 4, 3, 4, 3, 4, 1}, - {&i486_cost, 0, 0, 16, 15, 16, 15, 16, 1}, - {&pentium_cost, 0, 0, 16, 7, 16, 7, 16, 1}, - {&pentiumpro_cost, 0, 0, 16, 15, 16, 7, 16, 1}, - {&k6_cost, 0, 0, 32, 7, 32, 7, 32, 1}, - {&athlon_cost, 0, 0, 16, 7, 64, 7, 16, 1}, - {&pentium4_cost, 0, 0, 0, 0, 0, 0, 0, 1} + {&i386_cost, 0, 0, 4, 3, 4, 3, 4}, + {&i486_cost, 0, 0, 16, 15, 16, 15, 16}, + {&pentium_cost, 0, 0, 16, 7, 16, 7, 16}, + {&pentiumpro_cost, 0, 0, 16, 15, 16, 7, 16}, + {&k6_cost, 0, 0, 32, 7, 32, 7, 32}, + {&athlon_cost, 0, 0, 16, 7, 64, 7, 16}, + {&pentium4_cost, 0, 0, 0, 0, 0, 
0, 0} }; static const char * const cpu_names[] = TARGET_CPU_DEFAULT_NAMES; @@ -911,6 +986,9 @@ override_options () {"i586", PROCESSOR_PENTIUM, 0}, {"pentium", PROCESSOR_PENTIUM, 0}, {"pentium-mmx", PROCESSOR_PENTIUM, PTA_MMX}, + {"winchip-c6", PROCESSOR_I486, PTA_MMX}, + {"winchip2", PROCESSOR_I486, PTA_MMX | PTA_3DNOW}, + {"c3", PROCESSOR_I486, PTA_MMX | PTA_3DNOW}, {"i686", PROCESSOR_PENTIUMPRO, 0}, {"pentiumpro", PROCESSOR_PENTIUMPRO, 0}, {"pentium2", PROCESSOR_PENTIUMPRO, PTA_MMX}, @@ -934,6 +1012,11 @@ override_options () int const pta_size = ARRAY_SIZE (processor_alias_table); + /* By default our XFmode is the 80-bit extended format. If we have + use TFmode instead, it's also the 80-bit format, but with padding. */ + real_format_for_mode[XFmode - QFmode] = &ieee_extended_intel_96_format; + real_format_for_mode[TFmode - QFmode] = &ieee_extended_intel_128_format; + #ifdef SUBTARGET_OVERRIDE_OPTIONS SUBTARGET_OVERRIDE_OPTIONS; #endif @@ -993,19 +1076,19 @@ override_options () /* Default cpu tuning to the architecture. */ ix86_cpu = ix86_arch; if (processor_alias_table[i].flags & PTA_MMX - && !(target_flags & MASK_MMX_SET)) + && !(target_flags_explicit & MASK_MMX)) target_flags |= MASK_MMX; if (processor_alias_table[i].flags & PTA_3DNOW - && !(target_flags & MASK_3DNOW_SET)) + && !(target_flags_explicit & MASK_3DNOW)) target_flags |= MASK_3DNOW; if (processor_alias_table[i].flags & PTA_3DNOW_A - && !(target_flags & MASK_3DNOW_A_SET)) + && !(target_flags_explicit & MASK_3DNOW_A)) target_flags |= MASK_3DNOW_A; if (processor_alias_table[i].flags & PTA_SSE - && !(target_flags & MASK_SSE_SET)) + && !(target_flags_explicit & MASK_SSE)) target_flags |= MASK_SSE; if (processor_alias_table[i].flags & PTA_SSE2 - && !(target_flags & MASK_SSE2_SET)) + && !(target_flags_explicit & MASK_SSE2)) target_flags |= MASK_SSE2; if (processor_alias_table[i].flags & PTA_PREFETCH_SSE) x86_prefetch_sse = true; @@ -1112,20 +1195,20 @@ override_options () don't want additional code to keep the stack aligned when optimizing for code size. */ ix86_preferred_stack_boundary = (optimize_size - ? TARGET_64BIT ? 64 : 32 + ? TARGET_64BIT ? 128 : 32 : 128); if (ix86_preferred_stack_boundary_string) { i = atoi (ix86_preferred_stack_boundary_string); - if (i < (TARGET_64BIT ? 3 : 2) || i > 12) + if (i < (TARGET_64BIT ? 4 : 2) || i > 12) error ("-mpreferred-stack-boundary=%d is not between %d and 12", i, - TARGET_64BIT ? 3 : 2); + TARGET_64BIT ? 4 : 2); else ix86_preferred_stack_boundary = (1 << i) * BITS_PER_UNIT; } /* Validate -mbranch-cost= value, or provide default. */ - ix86_branch_cost = processor_target_table[ix86_cpu].branch_cost; + ix86_branch_cost = processor_target_table[ix86_cpu].cost->branch_cost; if (ix86_branch_cost_string) { i = atoi (ix86_branch_cost_string); @@ -1228,7 +1311,7 @@ override_options () target_flags |= MASK_3DNOW_A; } if ((x86_accumulate_outgoing_args & CPUMASK) - && !(target_flags & MASK_ACCUMULATE_OUTGOING_ARGS_SET) + && !(target_flags_explicit & MASK_ACCUMULATE_OUTGOING_ARGS) && !optimize_size) target_flags |= MASK_ACCUMULATE_OUTGOING_ARGS; @@ -1240,6 +1323,10 @@ override_options () internal_label_prefix_len = p - internal_label_prefix; *p = '\0'; } + + /* In 64-bit mode, we do not have support for vcall thunks. 
*/ + if (TARGET_64BIT) + targetm.asm_out.output_mi_vcall_thunk = NULL; } void @@ -1284,6 +1371,65 @@ const struct attribute_spec ix86_attribute_table[] = { NULL, 0, 0, false, false, false, NULL } }; +/* If PIC, we cannot make sibling calls to global functions + because the PLT requires %ebx live. + If we are returning floats on the register stack, we cannot make + sibling calls to functions that return floats. (The stack adjust + instruction will wind up after the sibcall jump, and not be executed.) */ + +static bool +ix86_function_ok_for_sibcall (decl, exp) + tree decl; + tree exp; +{ + /* We don't have 64-bit patterns in place. */ + if (TARGET_64BIT) + return false; + + /* If we are generating position-independent code, we cannot sibcall + optimize any indirect call, or a direct call to a global function, + as the PLT requires %ebx be live. */ + if (flag_pic && (!decl || TREE_PUBLIC (decl))) + return false; + + /* If we are returning floats on the 80387 register stack, we cannot + make a sibcall from a function that doesn't return a float to a + function that does; the necessary stack adjustment will not be + executed. */ + if (TARGET_FLOAT_RETURNS_IN_80387 + && FLOAT_MODE_P (TYPE_MODE (TREE_TYPE (exp))) + && !FLOAT_MODE_P (TYPE_MODE (TREE_TYPE (TREE_TYPE (cfun->decl))))) + return false; + + /* If this call is indirect, we'll need to be able to use a call-clobbered + register for the address of the target function. Make sure that all + such registers are not used for passing parameters. */ + if (!decl && !TARGET_64BIT) + { + int regparm = ix86_regparm; + tree attr, type; + + /* We're looking at the CALL_EXPR, we need the type of the function. */ + type = TREE_OPERAND (exp, 0); /* pointer expression */ + type = TREE_TYPE (type); /* pointer type */ + type = TREE_TYPE (type); /* function type */ + + attr = lookup_attribute ("regparm", TYPE_ATTRIBUTES (type)); + if (attr) + regparm = TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr))); + + if (regparm >= 3) + { + /* ??? Need to count the actual number of registers to be used, + not the possible number of registers. Fix later. */ + return false; + } + } + + /* Otherwise okay. That also includes certain types of indirect calls. */ + return true; +} + /* Handle a "cdecl" or "stdcall" attribute; arguments as in struct attribute_spec.handler. */ static tree @@ -1376,6 +1522,21 @@ ix86_comp_type_attributes (type1, type2) return 1; } +/* Return the regparm value for a fuctio with the indicated TYPE. */ + +static int +ix86_fntype_regparm (type) + tree type; +{ + tree attr; + + attr = lookup_attribute ("regparm", TYPE_ATTRIBUTES (type)); + if (attr) + return TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr))); + else + return ix86_regparm; +} + /* Value is the number of bytes of arguments automatically popped when returning from a subroutine call. FUNDECL is the declaration node of the function (as a tree), @@ -1419,15 +1580,7 @@ ix86_return_pops_args (fundecl, funtype, size) if (aggregate_value_p (TREE_TYPE (funtype)) && !TARGET_64BIT) { - int nregs = ix86_regparm; - - if (funtype) - { - tree attr = lookup_attribute ("regparm", TYPE_ATTRIBUTES (funtype)); - - if (attr) - nregs = TREE_INT_CST_LOW (TREE_VALUE (TREE_VALUE (attr))); - } + int nregs = ix86_fntype_regparm (funtype); if (!nregs) return GET_MODE_SIZE (Pmode); @@ -1592,7 +1745,11 @@ classify_argument (mode, type, classes, bit_offset) { int bytes = (mode == BLKmode) ? 
int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode); - int words = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD; + int words = (bytes + (bit_offset % 64) / 8 + UNITS_PER_WORD - 1) / UNITS_PER_WORD; + + /* Variable sized entities are always passed/returned in memory. */ + if (bytes < 0) + return 0; if (type && AGGREGATE_TYPE_P (type)) { @@ -1850,8 +2007,7 @@ classify_argument (mode, type, classes, bit_offset) case V2SImode: case V4HImode: case V8QImode: - classes[0] = X86_64_SSE_CLASS; - return 1; + return 0; case BLKmode: case VOIDmode: return 0; @@ -2750,6 +2906,43 @@ ix86_va_arg (valist, type) return addr_rtx; } +/* Return nonzero if OP is either a i387 or SSE fp register. */ +int +any_fp_register_operand (op, mode) + rtx op; + enum machine_mode mode ATTRIBUTE_UNUSED; +{ + return ANY_FP_REG_P (op); +} + +/* Return nonzero if OP is an i387 fp register. */ +int +fp_register_operand (op, mode) + rtx op; + enum machine_mode mode ATTRIBUTE_UNUSED; +{ + return FP_REG_P (op); +} + +/* Return nonzero if OP is a non-fp register_operand. */ +int +register_and_not_any_fp_reg_operand (op, mode) + rtx op; + enum machine_mode mode; +{ + return register_operand (op, mode) && !ANY_FP_REG_P (op); +} + +/* Return nonzero of OP is a register operand other than an + i387 fp register. */ +int +register_and_not_fp_reg_operand (op, mode) + rtx op; + enum machine_mode mode; +{ + return register_operand (op, mode) && !FP_REG_P (op); +} + /* Return nonzero if OP is general operand representable on x86_64. */ int @@ -3073,6 +3266,32 @@ call_insn_operand (op, mode) return general_operand (op, Pmode); } +/* Test for a valid operand for a call instruction. Don't allow the + arg pointer register or virtual regs since they may decay into + reg + const, which the patterns can't handle. */ + +int +sibcall_insn_operand (op, mode) + rtx op; + enum machine_mode mode ATTRIBUTE_UNUSED; +{ + /* Disallow indirect through a virtual register. This leads to + compiler aborts when trying to eliminate them. */ + if (GET_CODE (op) == REG + && (op == arg_pointer_rtx + || op == frame_pointer_rtx + || (REGNO (op) >= FIRST_PSEUDO_REGISTER + && REGNO (op) <= LAST_VIRTUAL_REGISTER))) + return 0; + + /* Explicitly allow SYMBOL_REF even if pic. */ + if (GET_CODE (op) == SYMBOL_REF) + return 1; + + /* Otherwise we can only allow register operands. */ + return register_operand (op, Pmode); +} + int constant_call_address_operand (op, mode) rtx op; @@ -3213,6 +3432,30 @@ nonmemory_no_elim_operand (op, mode) return GET_CODE (op) == CONST_INT || register_operand (op, mode); } +/* Return false if this is any eliminable register or stack register, + otherwise work like register_operand. */ + +int +index_register_operand (op, mode) + register rtx op; + enum machine_mode mode; +{ + rtx t = op; + if (GET_CODE (t) == SUBREG) + t = SUBREG_REG (t); + if (!REG_P (t)) + return 0; + if (t == arg_pointer_rtx + || t == frame_pointer_rtx + || t == virtual_incoming_args_rtx + || t == virtual_stack_vars_rtx + || t == virtual_stack_dynamic_rtx + || REGNO (t) == STACK_POINTER_REGNUM) + return 0; + + return general_operand (op, mode); +} + /* Return true if op is a Q_REGS class register. */ int @@ -3959,7 +4202,7 @@ output_set_got (dest) rtx xops[3]; xops[0] = dest; - xops[1] = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_"); + xops[1] = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME); if (! TARGET_DEEP_BRANCH_PREDICTION || !flag_pic) { @@ -3975,7 +4218,7 @@ output_set_got (dest) is what will be referred to by the Mach-O PIC subsystem. 
*/ ASM_OUTPUT_LABEL (asm_out_file, machopic_function_base_name ()); #endif - ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, "L", + (*targetm.asm_out.internal_label) (asm_out_file, "L", CODE_LABEL_NUMBER (XEXP (xops[2], 0))); if (flag_pic) @@ -5009,18 +5252,14 @@ legitimate_pic_address_disp_p (disp) case UNSPEC_GOTOFF: return local_symbolic_operand (XVECEXP (disp, 0, 0), Pmode); case UNSPEC_GOTTPOFF: + case UNSPEC_GOTNTPOFF: + case UNSPEC_INDNTPOFF: if (saw_plus) return false; return initial_exec_symbolic_operand (XVECEXP (disp, 0, 0), Pmode); case UNSPEC_NTPOFF: - /* ??? Could support offset here. */ - if (saw_plus) - return false; return local_exec_symbolic_operand (XVECEXP (disp, 0, 0), Pmode); case UNSPEC_DTPOFF: - /* ??? Could support offset here. */ - if (saw_plus) - return false; return local_dynamic_symbolic_operand (XVECEXP (disp, 0, 0), Pmode); } @@ -5196,6 +5435,8 @@ legitimate_address_p (mode, addr, strict) goto is_legitimate_pic; case UNSPEC_GOTTPOFF: + case UNSPEC_GOTNTPOFF: + case UNSPEC_INDNTPOFF: case UNSPEC_NTPOFF: case UNSPEC_DTPOFF: break; @@ -5471,23 +5712,7 @@ ix86_encode_section_info (decl, first) const char *symbol_str; char *newstr; size_t len; - enum tls_model kind; - - if (!flag_pic) - { - if (local_p) - kind = TLS_MODEL_LOCAL_EXEC; - else - kind = TLS_MODEL_INITIAL_EXEC; - } - /* Local dynamic is inefficient when we're not combining the - parts of the address. */ - else if (optimize && local_p) - kind = TLS_MODEL_LOCAL_DYNAMIC; - else - kind = TLS_MODEL_GLOBAL_DYNAMIC; - if (kind < flag_tls_default) - kind = flag_tls_default; + enum tls_model kind = decl_tls_model (decl); symbol_str = XSTR (symbol, 0); @@ -5602,32 +5827,36 @@ legitimize_address (x, oldx, mode) regs_ever_live[PIC_OFFSET_TABLE_REGNUM] = 1; pic = pic_offset_table_rtx; } - else + else if (!TARGET_GNU_TLS) { pic = gen_reg_rtx (Pmode); emit_insn (gen_set_got (pic)); } + else + pic = NULL; base = get_thread_pointer (); - off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), UNSPEC_GOTTPOFF); + off = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, x), + !TARGET_GNU_TLS + ? UNSPEC_GOTTPOFF + : flag_pic ? UNSPEC_GOTNTPOFF + : UNSPEC_INDNTPOFF); off = gen_rtx_CONST (Pmode, off); - off = gen_rtx_PLUS (Pmode, pic, off); + if (flag_pic || !TARGET_GNU_TLS) + off = gen_rtx_PLUS (Pmode, pic, off); off = gen_rtx_MEM (Pmode, off); RTX_UNCHANGING_P (off) = 1; set_mem_alias_set (off, ix86_GOT_alias_set ()); - - /* Damn Sun for specifing a set of dynamic relocations without - considering the two-operand nature of the architecture! - We'd be much better off with a "GOTNTPOFF" relocation that - already contained the negated constant. */ - /* ??? Using negl and reg+reg addressing appears to be a lose - size-wise. The negl is two bytes, just like the extra movl - incurred by the two-operand subl, but reg+reg addressing - uses the two-byte modrm form, unlike plain reg. */ - dest = gen_reg_rtx (Pmode); - emit_insn (gen_subsi3 (dest, base, off)); + + if (TARGET_GNU_TLS) + { + emit_move_insn (dest, off); + return gen_rtx_PLUS (Pmode, base, dest); + } + else + emit_insn (gen_subsi3 (dest, base, off)); break; case TLS_MODEL_LOCAL_EXEC: @@ -5908,6 +6137,7 @@ output_pic_addr_const (file, x, code) fputs ("@GOTPCREL(%rip)", file); break; case UNSPEC_GOTTPOFF: + /* FIXME: This might be @TPOFF in Sun ld too. 
*/ fputs ("@GOTTPOFF", file); break; case UNSPEC_TPOFF: @@ -5919,6 +6149,12 @@ output_pic_addr_const (file, x, code) case UNSPEC_DTPOFF: fputs ("@DTPOFF", file); break; + case UNSPEC_GOTNTPOFF: + fputs ("@GOTNTPOFF", file); + break; + case UNSPEC_INDNTPOFF: + fputs ("@INDNTPOFF", file); + break; default: output_operand_lossage ("invalid UNSPEC as operand"); break; @@ -5952,6 +6188,33 @@ i386_dwarf_output_addr_const (file, x) fputc ('\n', file); } +/* This is called from dwarf2out.c via ASM_OUTPUT_DWARF_DTPREL. + We need to emit DTP-relative relocations. */ + +void +i386_output_dwarf_dtprel (file, size, x) + FILE *file; + int size; + rtx x; +{ + switch (size) + { + case 4: + fputs (ASM_LONG, file); + break; + case 8: +#ifdef ASM_QUAD + fputs (ASM_QUAD, file); + break; +#endif + default: + abort (); + } + + output_addr_const (file, x); + fputs ("@DTPOFF", file); +} + /* In the name of slightly smaller debug output, and to cater to general assembler losage, recognize PIC+GOTOFF and turn it back into a direct symbol reference. */ @@ -6602,22 +6865,18 @@ print_operand (file, x, code) /* These float cases don't actually occur as immediate operands. */ else if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) == DFmode) { - REAL_VALUE_TYPE r; char dstr[30]; - REAL_VALUE_FROM_CONST_DOUBLE (r, x); - REAL_VALUE_TO_DECIMAL (r, "%.22e", dstr); + real_to_decimal (dstr, CONST_DOUBLE_REAL_VALUE (x), sizeof (dstr), 0, 1); fprintf (file, "%s", dstr); } else if (GET_CODE (x) == CONST_DOUBLE && (GET_MODE (x) == XFmode || GET_MODE (x) == TFmode)) { - REAL_VALUE_TYPE r; char dstr[30]; - REAL_VALUE_FROM_CONST_DOUBLE (r, x); - REAL_VALUE_TO_DECIMAL (r, "%.22e", dstr); + real_to_decimal (dstr, CONST_DOUBLE_REAL_VALUE (x), sizeof (dstr), 0, 1); fprintf (file, "%s", dstr); } @@ -6702,7 +6961,8 @@ print_operand_address (file, addr) || GET_CODE (addr) == LABEL_REF || (GET_CODE (addr) == CONST && GET_CODE (XEXP (addr, 0)) == PLUS - && GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF + && (GET_CODE (XEXP (XEXP (addr, 0), 0)) == SYMBOL_REF + || GET_CODE (XEXP (XEXP (addr, 0), 0)) == LABEL_REF) && GET_CODE (XEXP (XEXP (addr, 0), 1)) == CONST_INT))) fputs ("(%rip)", file); } @@ -6801,6 +7061,7 @@ output_addr_const_extra (file, x) { case UNSPEC_GOTTPOFF: output_addr_const (file, op); + /* FIXME: This might be @TPOFF in Sun ld. */ fputs ("@GOTTPOFF", file); break; case UNSPEC_TPOFF: @@ -6815,6 +7076,14 @@ output_addr_const_extra (file, x) output_addr_const (file, op); fputs ("@DTPOFF", file); break; + case UNSPEC_GOTNTPOFF: + output_addr_const (file, op); + fputs ("@GOTNTPOFF", file); + break; + case UNSPEC_INDNTPOFF: + output_addr_const (file, op); + fputs ("@INDNTPOFF", file); + break; default: return false; @@ -7331,8 +7600,8 @@ ix86_output_addr_diff_elt (file, value, rel) machopic_function_base_name () + 1); #endif else - asm_fprintf (file, "%s%U_GLOBAL_OFFSET_TABLE_+[.-%s%d]\n", - ASM_LONG, LPREFIX, value); + asm_fprintf (file, "%s%U%s+[.-%s%d]\n", + ASM_LONG, GOT_SYMBOL_NAME, LPREFIX, value); } /* Generate either "mov $0, reg" or "xor reg, reg", as appropriate @@ -7539,8 +7808,7 @@ ix86_expand_vector_move (mode, operands) /* Make operand1 a register if it isn't already. 
*/ if ((reload_in_progress | reload_completed) == 0 && !register_operand (operands[0], mode) - && !register_operand (operands[1], mode) - && operands[1] != CONST0_RTX (mode)) + && !register_operand (operands[1], mode)) { rtx temp = force_reg (GET_MODE (operands[1]), operands[1]); emit_move_insn (operands[0], temp); @@ -8990,7 +9258,7 @@ ix86_expand_int_movcc (operands) emit_insn (gen_rtx_SET (VOIDmode, out, tmp)); } if (out != operands[0]) - emit_move_insn (operands[0], out); + emit_move_insn (operands[0], copy_rtx (out)); return 1; /* DONE */ } @@ -9009,12 +9277,9 @@ ix86_expand_int_movcc (operands) * This is reasonably steep, but branch mispredict costs are * high on modern cpus, so consider failing only if optimizing * for space. - * - * %%% Parameterize branch_cost on the tuning architecture, then - * use that. The 80386 couldn't care less about mispredicts. */ - if (!optimize_size && !TARGET_CMOVE) + if (!TARGET_CMOVE && BRANCH_COST >= 2) { if (cf == 0) { @@ -9092,7 +9357,7 @@ ix86_expand_int_movcc (operands) optab op; rtx var, orig_out, out, tmp; - if (optimize_size) + if (BRANCH_COST >= 2) return 0; /* FAIL */ /* If one of the two operands is an interesting constant, load a @@ -11017,13 +11282,6 @@ ix86_adjust_cost (insn, link, dep_insn, cost) memory = get_attr_memory (insn); dep_memory = get_attr_memory (dep_insn); - if (dep_memory == MEMORY_LOAD || dep_memory == MEMORY_BOTH) - { - if (dep_insn_type == TYPE_IMOV || dep_insn_type == TYPE_FMOV) - cost += 2; - else - cost += 3; - } /* Show ability of reorder buffer to hide latency of load by executing in parallel with previous instruction in case previous instruction is not needed to compute the address. */ @@ -11297,7 +11555,7 @@ ix86_variable_issue (dump, sched_verbose, insn, can_issue_more) static int ia32_use_dfa_pipeline_interface () { - if (ix86_cpu == PROCESSOR_PENTIUM) + if (ix86_cpu == PROCESSOR_PENTIUM || ix86_cpu == PROCESSOR_ATHLON) return 1; return 0; } @@ -11568,6 +11826,11 @@ x86_initialize_trampoline (tramp, fnaddr, cxt) if (offset > TRAMPOLINE_SIZE) abort (); } + +#ifdef TRANSFER_FROM_TRAMPOLINE + emit_library_call (gen_rtx (SYMBOL_REF, Pmode, "__enable_execute_stack"), + LCT_NORMAL, VOIDmode, 1, tramp, Pmode); +#endif } #define def_builtin(MASK, NAME, TYPE, CODE) \ @@ -11592,30 +11855,30 @@ struct builtin_description static const struct builtin_description bdesc_comi[] = { - { MASK_SSE1, CODE_FOR_sse_comi, "__builtin_ia32_comieq", IX86_BUILTIN_COMIEQSS, EQ, 0 }, - { MASK_SSE1, CODE_FOR_sse_comi, "__builtin_ia32_comilt", IX86_BUILTIN_COMILTSS, LT, 0 }, - { MASK_SSE1, CODE_FOR_sse_comi, "__builtin_ia32_comile", IX86_BUILTIN_COMILESS, LE, 0 }, - { MASK_SSE1, CODE_FOR_sse_comi, "__builtin_ia32_comigt", IX86_BUILTIN_COMIGTSS, LT, 1 }, - { MASK_SSE1, CODE_FOR_sse_comi, "__builtin_ia32_comige", IX86_BUILTIN_COMIGESS, LE, 1 }, - { MASK_SSE1, CODE_FOR_sse_comi, "__builtin_ia32_comineq", IX86_BUILTIN_COMINEQSS, NE, 0 }, - { MASK_SSE1, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomieq", IX86_BUILTIN_UCOMIEQSS, EQ, 0 }, - { MASK_SSE1, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomilt", IX86_BUILTIN_UCOMILTSS, LT, 0 }, - { MASK_SSE1, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomile", IX86_BUILTIN_UCOMILESS, LE, 0 }, - { MASK_SSE1, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomigt", IX86_BUILTIN_UCOMIGTSS, LT, 1 }, - { MASK_SSE1, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomige", IX86_BUILTIN_UCOMIGESS, LE, 1 }, - { MASK_SSE1, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomineq", IX86_BUILTIN_UCOMINEQSS, NE, 0 }, - { MASK_SSE2, CODE_FOR_sse2_comi, 
"__builtin_ia32_comisdeq", IX86_BUILTIN_COMIEQSD, EQ, 0 }, - { MASK_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdlt", IX86_BUILTIN_COMILTSD, LT, 0 }, - { MASK_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdle", IX86_BUILTIN_COMILESD, LE, 0 }, - { MASK_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdgt", IX86_BUILTIN_COMIGTSD, LT, 1 }, - { MASK_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdge", IX86_BUILTIN_COMIGESD, LE, 1 }, - { MASK_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdneq", IX86_BUILTIN_COMINEQSD, NE, 0 }, - { MASK_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdeq", IX86_BUILTIN_UCOMIEQSD, EQ, 0 }, - { MASK_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdlt", IX86_BUILTIN_UCOMILTSD, LT, 0 }, - { MASK_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdle", IX86_BUILTIN_UCOMILESD, LE, 0 }, - { MASK_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdgt", IX86_BUILTIN_UCOMIGTSD, LT, 1 }, - { MASK_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdge", IX86_BUILTIN_UCOMIGESD, LE, 1 }, - { MASK_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdneq", IX86_BUILTIN_UCOMINEQSD, NE, 0 }, + { MASK_SSE1, CODE_FOR_sse_comi, "__builtin_ia32_comieq", IX86_BUILTIN_COMIEQSS, UNEQ, 0 }, + { MASK_SSE1, CODE_FOR_sse_comi, "__builtin_ia32_comilt", IX86_BUILTIN_COMILTSS, UNLT, 0 }, + { MASK_SSE1, CODE_FOR_sse_comi, "__builtin_ia32_comile", IX86_BUILTIN_COMILESS, UNLE, 0 }, + { MASK_SSE1, CODE_FOR_sse_comi, "__builtin_ia32_comigt", IX86_BUILTIN_COMIGTSS, GT, 0 }, + { MASK_SSE1, CODE_FOR_sse_comi, "__builtin_ia32_comige", IX86_BUILTIN_COMIGESS, GE, 0 }, + { MASK_SSE1, CODE_FOR_sse_comi, "__builtin_ia32_comineq", IX86_BUILTIN_COMINEQSS, LTGT, 0 }, + { MASK_SSE1, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomieq", IX86_BUILTIN_UCOMIEQSS, UNEQ, 0 }, + { MASK_SSE1, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomilt", IX86_BUILTIN_UCOMILTSS, UNLT, 0 }, + { MASK_SSE1, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomile", IX86_BUILTIN_UCOMILESS, UNLE, 0 }, + { MASK_SSE1, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomigt", IX86_BUILTIN_UCOMIGTSS, GT, 0 }, + { MASK_SSE1, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomige", IX86_BUILTIN_UCOMIGESS, GE, 0 }, + { MASK_SSE1, CODE_FOR_sse_ucomi, "__builtin_ia32_ucomineq", IX86_BUILTIN_UCOMINEQSS, LTGT, 0 }, + { MASK_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdeq", IX86_BUILTIN_COMIEQSD, UNEQ, 0 }, + { MASK_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdlt", IX86_BUILTIN_COMILTSD, UNLT, 0 }, + { MASK_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdle", IX86_BUILTIN_COMILESD, UNLE, 0 }, + { MASK_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdgt", IX86_BUILTIN_COMIGTSD, GT, 0 }, + { MASK_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdge", IX86_BUILTIN_COMIGESD, GE, 0 }, + { MASK_SSE2, CODE_FOR_sse2_comi, "__builtin_ia32_comisdneq", IX86_BUILTIN_COMINEQSD, LTGT, 0 }, + { MASK_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdeq", IX86_BUILTIN_UCOMIEQSD, UNEQ, 0 }, + { MASK_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdlt", IX86_BUILTIN_UCOMILTSD, UNLT, 0 }, + { MASK_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdle", IX86_BUILTIN_UCOMILESD, UNLE, 0 }, + { MASK_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdgt", IX86_BUILTIN_UCOMIGTSD, GT, 0 }, + { MASK_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdge", IX86_BUILTIN_UCOMIGESD, GE, 0 }, + { MASK_SSE2, CODE_FOR_sse2_ucomi, "__builtin_ia32_ucomisdneq", IX86_BUILTIN_UCOMINEQSD, LTGT, 0 }, }; static const struct builtin_description bdesc_2arg[] = @@ -11645,14 +11908,10 @@ static const struct builtin_description bdesc_2arg[] = { 
MASK_SSE1, CODE_FOR_vmmaskcmpv4sf3, "__builtin_ia32_cmpeqss", IX86_BUILTIN_CMPEQSS, EQ, 0 }, { MASK_SSE1, CODE_FOR_vmmaskcmpv4sf3, "__builtin_ia32_cmpltss", IX86_BUILTIN_CMPLTSS, LT, 0 }, { MASK_SSE1, CODE_FOR_vmmaskcmpv4sf3, "__builtin_ia32_cmpless", IX86_BUILTIN_CMPLESS, LE, 0 }, - { MASK_SSE1, CODE_FOR_vmmaskcmpv4sf3, "__builtin_ia32_cmpgtss", IX86_BUILTIN_CMPGTSS, LT, 1 }, - { MASK_SSE1, CODE_FOR_vmmaskcmpv4sf3, "__builtin_ia32_cmpgess", IX86_BUILTIN_CMPGESS, LE, 1 }, { MASK_SSE1, CODE_FOR_vmmaskcmpv4sf3, "__builtin_ia32_cmpunordss", IX86_BUILTIN_CMPUNORDSS, UNORDERED, 0 }, { MASK_SSE1, CODE_FOR_vmmaskncmpv4sf3, "__builtin_ia32_cmpneqss", IX86_BUILTIN_CMPNEQSS, EQ, 0 }, { MASK_SSE1, CODE_FOR_vmmaskncmpv4sf3, "__builtin_ia32_cmpnltss", IX86_BUILTIN_CMPNLTSS, LT, 0 }, { MASK_SSE1, CODE_FOR_vmmaskncmpv4sf3, "__builtin_ia32_cmpnless", IX86_BUILTIN_CMPNLESS, LE, 0 }, - { MASK_SSE1, CODE_FOR_vmmaskncmpv4sf3, "__builtin_ia32_cmpngtss", IX86_BUILTIN_CMPNGTSS, LT, 1 }, - { MASK_SSE1, CODE_FOR_vmmaskncmpv4sf3, "__builtin_ia32_cmpngess", IX86_BUILTIN_CMPNGESS, LE, 1 }, { MASK_SSE1, CODE_FOR_vmmaskncmpv4sf3, "__builtin_ia32_cmpordss", IX86_BUILTIN_CMPORDSS, UNORDERED, 0 }, { MASK_SSE1, CODE_FOR_sminv4sf3, "__builtin_ia32_minps", IX86_BUILTIN_MINPS, 0, 0 }, @@ -11660,6 +11919,11 @@ static const struct builtin_description bdesc_2arg[] = { MASK_SSE1, CODE_FOR_vmsminv4sf3, "__builtin_ia32_minss", IX86_BUILTIN_MINSS, 0, 0 }, { MASK_SSE1, CODE_FOR_vmsmaxv4sf3, "__builtin_ia32_maxss", IX86_BUILTIN_MAXSS, 0, 0 }, + { MASK_SSE1, CODE_FOR_sse_andv4sf3, "__builtin_ia32_andps", IX86_BUILTIN_ANDPS, 0, 0 }, + { MASK_SSE1, CODE_FOR_sse_nandv4sf3, "__builtin_ia32_andnps", IX86_BUILTIN_ANDNPS, 0, 0 }, + { MASK_SSE1, CODE_FOR_sse_iorv4sf3, "__builtin_ia32_orps", IX86_BUILTIN_ORPS, 0, 0 }, + { MASK_SSE1, CODE_FOR_sse_xorv4sf3, "__builtin_ia32_xorps", IX86_BUILTIN_XORPS, 0, 0 }, + { MASK_SSE1, CODE_FOR_sse_movss, "__builtin_ia32_movss", IX86_BUILTIN_MOVSS, 0, 0 }, { MASK_SSE1, CODE_FOR_sse_movhlps, "__builtin_ia32_movhlps", IX86_BUILTIN_MOVHLPS, 0, 0 }, { MASK_SSE1, CODE_FOR_sse_movlhps, "__builtin_ia32_movlhps", IX86_BUILTIN_MOVLHPS, 0, 0 }, @@ -11769,14 +12033,10 @@ static const struct builtin_description bdesc_2arg[] = { MASK_SSE2, CODE_FOR_vmmaskcmpv2df3, "__builtin_ia32_cmpeqsd", IX86_BUILTIN_CMPEQSD, EQ, 0 }, { MASK_SSE2, CODE_FOR_vmmaskcmpv2df3, "__builtin_ia32_cmpltsd", IX86_BUILTIN_CMPLTSD, LT, 0 }, { MASK_SSE2, CODE_FOR_vmmaskcmpv2df3, "__builtin_ia32_cmplesd", IX86_BUILTIN_CMPLESD, LE, 0 }, - { MASK_SSE2, CODE_FOR_vmmaskcmpv2df3, "__builtin_ia32_cmpgtsd", IX86_BUILTIN_CMPGTSD, LT, 1 }, - { MASK_SSE2, CODE_FOR_vmmaskcmpv2df3, "__builtin_ia32_cmpgesd", IX86_BUILTIN_CMPGESD, LE, 1 }, { MASK_SSE2, CODE_FOR_vmmaskcmpv2df3, "__builtin_ia32_cmpunordsd", IX86_BUILTIN_CMPUNORDSD, UNORDERED, 0 }, { MASK_SSE2, CODE_FOR_vmmaskncmpv2df3, "__builtin_ia32_cmpneqsd", IX86_BUILTIN_CMPNEQSD, EQ, 0 }, { MASK_SSE2, CODE_FOR_vmmaskncmpv2df3, "__builtin_ia32_cmpnltsd", IX86_BUILTIN_CMPNLTSD, LT, 0 }, { MASK_SSE2, CODE_FOR_vmmaskncmpv2df3, "__builtin_ia32_cmpnlesd", IX86_BUILTIN_CMPNLESD, LE, 0 }, - { MASK_SSE2, CODE_FOR_vmmaskncmpv2df3, "__builtin_ia32_cmpngtsd", IX86_BUILTIN_CMPNGTSD, LT, 1 }, - { MASK_SSE2, CODE_FOR_vmmaskncmpv2df3, "__builtin_ia32_cmpngesd", IX86_BUILTIN_CMPNGESD, LE, 1 }, { MASK_SSE2, CODE_FOR_vmmaskncmpv2df3, "__builtin_ia32_cmpordsd", IX86_BUILTIN_CMPORDSD, UNORDERED, 0 }, { MASK_SSE2, CODE_FOR_sminv2df3, "__builtin_ia32_minpd", IX86_BUILTIN_MINPD, 0, 0 }, @@ -11784,10 +12044,10 @@ static const struct 
builtin_description bdesc_2arg[] = { MASK_SSE2, CODE_FOR_vmsminv2df3, "__builtin_ia32_minsd", IX86_BUILTIN_MINSD, 0, 0 }, { MASK_SSE2, CODE_FOR_vmsmaxv2df3, "__builtin_ia32_maxsd", IX86_BUILTIN_MAXSD, 0, 0 }, - { MASK_SSE2, CODE_FOR_sse2_anddf3, "__builtin_ia32_andpd", IX86_BUILTIN_ANDPD, 0, 0 }, - { MASK_SSE2, CODE_FOR_sse2_nanddf3, "__builtin_ia32_andnpd", IX86_BUILTIN_ANDNPD, 0, 0 }, - { MASK_SSE2, CODE_FOR_sse2_iordf3, "__builtin_ia32_orpd", IX86_BUILTIN_ORPD, 0, 0 }, - { MASK_SSE2, CODE_FOR_sse2_xordf3, "__builtin_ia32_xorpd", IX86_BUILTIN_XORPD, 0, 0 }, + { MASK_SSE2, CODE_FOR_sse2_andv2df3, "__builtin_ia32_andpd", IX86_BUILTIN_ANDPD, 0, 0 }, + { MASK_SSE2, CODE_FOR_sse2_nandv2df3, "__builtin_ia32_andnpd", IX86_BUILTIN_ANDNPD, 0, 0 }, + { MASK_SSE2, CODE_FOR_sse2_iorv2df3, "__builtin_ia32_orpd", IX86_BUILTIN_ORPD, 0, 0 }, + { MASK_SSE2, CODE_FOR_sse2_xorv2df3, "__builtin_ia32_xorpd", IX86_BUILTIN_XORPD, 0, 0 }, { MASK_SSE2, CODE_FOR_sse2_movsd, "__builtin_ia32_movsd", IX86_BUILTIN_MOVSD, 0, 0 }, { MASK_SSE2, CODE_FOR_sse2_unpckhpd, "__builtin_ia32_unpckhpd", IX86_BUILTIN_UNPCKHPD, 0, 0 }, @@ -12292,11 +12552,6 @@ ix86_init_mmx_sse_builtins () def_builtin (MASK_SSE1, "__builtin_ia32_cvttps2pi", v2si_ftype_v4sf, IX86_BUILTIN_CVTTPS2PI); def_builtin (MASK_SSE1, "__builtin_ia32_cvttss2si", int_ftype_v4sf, IX86_BUILTIN_CVTTSS2SI); - def_builtin (MASK_SSE1, "__builtin_ia32_andps", v4sf_ftype_v4sf_v4sf, IX86_BUILTIN_ANDPS); - def_builtin (MASK_SSE1, "__builtin_ia32_andnps", v4sf_ftype_v4sf_v4sf, IX86_BUILTIN_ANDNPS); - def_builtin (MASK_SSE1, "__builtin_ia32_orps", v4sf_ftype_v4sf_v4sf, IX86_BUILTIN_ORPS); - def_builtin (MASK_SSE1, "__builtin_ia32_xorps", v4sf_ftype_v4sf_v4sf, IX86_BUILTIN_XORPS); - def_builtin (MASK_SSE1 | MASK_3DNOW_A, "__builtin_ia32_pextrw", int_ftype_v4hi_int, IX86_BUILTIN_PEXTRW); def_builtin (MASK_SSE1 | MASK_3DNOW_A, "__builtin_ia32_pinsrw", v4hi_ftype_v4hi_int_int, IX86_BUILTIN_PINSRW); @@ -12444,10 +12699,12 @@ ix86_init_mmx_sse_builtins () def_builtin (MASK_SSE2, "__builtin_ia32_psraw128", v8hi_ftype_v8hi_v2di, IX86_BUILTIN_PSRAW128); def_builtin (MASK_SSE2, "__builtin_ia32_psrad128", v4si_ftype_v4si_v2di, IX86_BUILTIN_PSRAD128); + def_builtin (MASK_SSE2, "__builtin_ia32_pslldqi128", v2di_ftype_v2di_int, IX86_BUILTIN_PSLLDQI128); def_builtin (MASK_SSE2, "__builtin_ia32_psllwi128", v8hi_ftype_v8hi_int, IX86_BUILTIN_PSLLWI128); def_builtin (MASK_SSE2, "__builtin_ia32_pslldi128", v4si_ftype_v4si_int, IX86_BUILTIN_PSLLDI128); def_builtin (MASK_SSE2, "__builtin_ia32_psllqi128", v2di_ftype_v2di_int, IX86_BUILTIN_PSLLQI128); + def_builtin (MASK_SSE2, "__builtin_ia32_psrldqi128", v2di_ftype_v2di_int, IX86_BUILTIN_PSRLDQI128); def_builtin (MASK_SSE2, "__builtin_ia32_psrlwi128", v8hi_ftype_v8hi_int, IX86_BUILTIN_PSRLWI128); def_builtin (MASK_SSE2, "__builtin_ia32_psrldi128", v4si_ftype_v4si_int, IX86_BUILTIN_PSRLDI128); def_builtin (MASK_SSE2, "__builtin_ia32_psrlqi128", v2di_ftype_v2di_int, IX86_BUILTIN_PSRLQI128); @@ -12475,7 +12732,8 @@ safe_vector_operand (x, mode) : gen_rtx_SUBREG (DImode, x, 0))); else emit_insn (gen_sse_clrv4sf (mode == V4SFmode ? x - : gen_rtx_SUBREG (V4SFmode, x, 0))); + : gen_rtx_SUBREG (V4SFmode, x, 0), + CONST0_RTX (V4SFmode))); return x; } @@ -12529,45 +12787,6 @@ ix86_expand_binop_builtin (icode, arglist, target) return target; } -/* In type_for_mode we restrict the ability to create TImode types - to hosts with 64-bit H_W_I. So we've defined the SSE logicals - to have a V4SFmode signature. Convert them in-place to TImode. 
*/ - -static rtx -ix86_expand_timode_binop_builtin (icode, arglist, target) - enum insn_code icode; - tree arglist; - rtx target; -{ - rtx pat; - tree arg0 = TREE_VALUE (arglist); - tree arg1 = TREE_VALUE (TREE_CHAIN (arglist)); - rtx op0 = expand_expr (arg0, NULL_RTX, VOIDmode, 0); - rtx op1 = expand_expr (arg1, NULL_RTX, VOIDmode, 0); - - op0 = gen_lowpart (TImode, op0); - op1 = gen_lowpart (TImode, op1); - target = gen_reg_rtx (TImode); - - if (! (*insn_data[icode].operand[1].predicate) (op0, TImode)) - op0 = copy_to_mode_reg (TImode, op0); - if (! (*insn_data[icode].operand[2].predicate) (op1, TImode)) - op1 = copy_to_mode_reg (TImode, op1); - - /* In the commutative cases, both op0 and op1 are nonimmediate_operand, - yet one of the two must not be a memory. This is normally enforced - by expanders, but we didn't bother to create one here. */ - if (GET_CODE (op0) == MEM && GET_CODE (op1) == MEM) - op0 = copy_to_mode_reg (TImode, op0); - - pat = GEN_FCN (icode) (target, op0, op1); - if (! pat) - return 0; - emit_insn (pat); - - return gen_lowpart (V4SFmode, target); -} - /* Subroutine of ix86_expand_builtin to take care of stores. */ static rtx @@ -12765,14 +12984,14 @@ ix86_expand_sse_comi (d, arglist, target) op1 = copy_to_mode_reg (mode1, op1); op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1); - pat = GEN_FCN (d->icode) (op0, op1, op2); + pat = GEN_FCN (d->icode) (op0, op1); if (! pat) return 0; emit_insn (pat); emit_insn (gen_rtx_SET (VOIDmode, gen_rtx_STRICT_LOW_PART (VOIDmode, target), gen_rtx_fmt_ee (comparison, QImode, - gen_rtx_REG (CCmode, FLAGS_REG), + SET_DEST (pat), const0_rtx))); return SUBREG_REG (target); @@ -12913,19 +13132,6 @@ ix86_expand_builtin (exp, target, subtarget, mode, ignore) case IX86_BUILTIN_RCPSS: return ix86_expand_unop1_builtin (CODE_FOR_vmrcpv4sf2, arglist, target); - case IX86_BUILTIN_ANDPS: - return ix86_expand_timode_binop_builtin (CODE_FOR_sse_andti3, - arglist, target); - case IX86_BUILTIN_ANDNPS: - return ix86_expand_timode_binop_builtin (CODE_FOR_sse_nandti3, - arglist, target); - case IX86_BUILTIN_ORPS: - return ix86_expand_timode_binop_builtin (CODE_FOR_sse_iorti3, - arglist, target); - case IX86_BUILTIN_XORPS: - return ix86_expand_timode_binop_builtin (CODE_FOR_sse_xorti3, - arglist, target); - case IX86_BUILTIN_LOADAPS: return ix86_expand_unop_builtin (CODE_FOR_sse_movaps, arglist, target, 1); @@ -13084,6 +13290,35 @@ ix86_expand_builtin (exp, target, subtarget, mode, ignore) emit_insn (pat); return target; + case IX86_BUILTIN_PSLLDQI128: + case IX86_BUILTIN_PSRLDQI128: + icode = ( fcode == IX86_BUILTIN_PSLLDQI128 ? CODE_FOR_sse2_ashlti3 + : CODE_FOR_sse2_lshrti3); + arg0 = TREE_VALUE (arglist); + arg1 = TREE_VALUE (TREE_CHAIN (arglist)); + op0 = expand_expr (arg0, NULL_RTX, VOIDmode, 0); + op1 = expand_expr (arg1, NULL_RTX, VOIDmode, 0); + tmode = insn_data[icode].operand[0].mode; + mode1 = insn_data[icode].operand[1].mode; + mode2 = insn_data[icode].operand[2].mode; + + if (! (*insn_data[icode].operand[1].predicate) (op0, mode1)) + { + op0 = copy_to_reg (op0); + op0 = simplify_gen_subreg (mode1, op0, GET_MODE (op0), 0); + } + if (! (*insn_data[icode].operand[2].predicate) (op1, mode2)) + { + error ("shift must be an immediate"); + return const0_rtx; + } + target = gen_reg_rtx (V2DImode); + pat = GEN_FCN (icode) (simplify_gen_subreg (tmode, target, V2DImode, 0), op0, op1); + if (! 
pat) + return 0; + emit_insn (pat); + return target; + case IX86_BUILTIN_FEMMS: emit_insn (gen_femms ()); return NULL_RTX; @@ -13165,7 +13400,7 @@ ix86_expand_builtin (exp, target, subtarget, mode, ignore) case IX86_BUILTIN_SSE_ZERO: target = gen_reg_rtx (V4SFmode); - emit_insn (gen_sse_clrv4sf (target)); + emit_insn (gen_sse_clrv4sf (target, CONST0_RTX (V4SFmode))); return target; case IX86_BUILTIN_MMX_ZERO: @@ -13230,6 +13465,11 @@ ix86_expand_builtin (exp, target, subtarget, mode, ignore) case IX86_BUILTIN_STORERPD: return ix86_expand_store_builtin (CODE_FOR_sse2_movapd, arglist); + case IX86_BUILTIN_CLRPD: + target = gen_reg_rtx (V2DFmode); + emit_insn (gen_sse_clrv2df (target)); + return target; + case IX86_BUILTIN_MFENCE: emit_insn (gen_sse2_mfence ()); return 0; @@ -13241,9 +13481,8 @@ ix86_expand_builtin (exp, target, subtarget, mode, ignore) arg0 = TREE_VALUE (arglist); op0 = expand_expr (arg0, NULL_RTX, VOIDmode, 0); icode = CODE_FOR_sse2_clflush; - mode0 = insn_data[icode].operand[0].mode; - if (! (*insn_data[icode].operand[0].predicate) (op0, mode0)) - op0 = copy_to_mode_reg (mode0, op0); + if (! (*insn_data[icode].operand[0].predicate) (op0, Pmode)) + op0 = copy_to_mode_reg (Pmode, op0); emit_insn (gen_sse2_clflush (op0)); return 0; @@ -13402,6 +13641,8 @@ ix86_preferred_reload_class (x, class) rtx x; enum reg_class class; { + if (GET_CODE (x) == CONST_VECTOR && x != CONST0_RTX (GET_MODE (x))) + return NO_REGS; if (GET_CODE (x) == CONST_DOUBLE && GET_MODE (x) != VOIDmode) { /* SSE can't load any constant directly yet. */ @@ -13477,17 +13718,33 @@ ix86_register_move_cost (mode, class1, class2) enum reg_class class1, class2; { /* In case we require secondary memory, compute cost of the store followed - by load. In case of copying from general_purpose_register we may emit - multiple stores followed by single load causing memory size mismatch - stall. Count this as arbitarily high cost of 20. */ + by load. In order to avoid bad register allocation choices, we need + for this to be *at least* as high as the symmetric MEMORY_MOVE_COST. */ + if (ix86_secondary_memory_needed (class1, class2, mode, 0)) { - int add_cost = 0; + int cost = 1; + + cost += MAX (MEMORY_MOVE_COST (mode, class1, 0), + MEMORY_MOVE_COST (mode, class1, 1)); + cost += MAX (MEMORY_MOVE_COST (mode, class2, 0), + MEMORY_MOVE_COST (mode, class2, 1)); + + /* In case of copying from general_purpose_register we may emit multiple + stores followed by single load causing memory size mismatch stall. + Count this as arbitarily high cost of 20. */ if (CLASS_MAX_NREGS (class1, mode) > CLASS_MAX_NREGS (class2, mode)) - add_cost = 20; - return (MEMORY_MOVE_COST (mode, class1, 0) - + MEMORY_MOVE_COST (mode, class2, 1) + add_cost); + cost += 20; + + /* In the case of FP/MMX moves, the registers actually overlap, and we + have to switch modes in order to treat them differently. */ + if ((MMX_CLASS_P (class1) && MAYBE_FLOAT_CLASS_P (class2)) + || (MMX_CLASS_P (class2) && MAYBE_FLOAT_CLASS_P (class1))) + cost += 20; + + return cost; } + /* Moves between SSE/MMX and integer unit are expensive. */ if (MMX_CLASS_P (class1) != MMX_CLASS_P (class2) || SSE_CLASS_P (class1) != SSE_CLASS_P (class2)) @@ -13620,7 +13877,8 @@ ix86_memory_move_cost (mode, class, in) if (mode == TFmode) mode = XFmode; return ((in ? 
ix86_cost->int_load[2] : ix86_cost->int_store[2]) - * (int) GET_MODE_SIZE (mode) / 4); + * ((int) GET_MODE_SIZE (mode) + + UNITS_PER_WORD -1 ) / UNITS_PER_WORD); } } @@ -13746,27 +14004,51 @@ x86_order_regs_for_local_alloc () reg_alloc_order [pos++] = 0; } -void -x86_output_mi_thunk (file, delta, function) - FILE *file; - int delta; +/* Returns an expression indicating where the this parameter is + located on entry to the FUNCTION. */ + +static rtx +ia32_this_parameter (function) tree function; { - tree parm; - rtx xops[3]; + tree type = TREE_TYPE (function); - if (ix86_regparm > 0) - parm = TYPE_ARG_TYPES (TREE_TYPE (function)); + if (ix86_fntype_regparm (type) > 0) + { + tree parm; + + parm = TYPE_ARG_TYPES (type); + /* Figure out whether or not the function has a variable number of + arguments. */ + for (; parm; parm = TREE_CHAIN (parm))\ + if (TREE_VALUE (parm) == void_type_node) + break; + /* If not, the this parameter is in %eax. */ + if (parm) + return gen_rtx_REG (SImode, 0); + } + + if (aggregate_value_p (TREE_TYPE (type))) + return gen_rtx_MEM (SImode, plus_constant (stack_pointer_rtx, 8)); else - parm = NULL_TREE; - for (; parm; parm = TREE_CHAIN (parm)) - if (TREE_VALUE (parm) == void_type_node) - break; + return gen_rtx_MEM (SImode, plus_constant (stack_pointer_rtx, 4)); +} + + +static void +x86_output_mi_vcall_thunk (file, thunk, delta, vcall_index, function) + FILE *file; + tree thunk ATTRIBUTE_UNUSED; + HOST_WIDE_INT delta; + HOST_WIDE_INT vcall_index; + tree function; +{ + rtx xops[3]; - xops[0] = GEN_INT (delta); if (TARGET_64BIT) { int n = aggregate_value_p (TREE_TYPE (TREE_TYPE (function))) != 0; + xops[0] = GEN_INT (delta); xops[1] = gen_rtx_REG (DImode, x86_64_int_parameter_registers[n]); output_asm_insn ("add{q} {%0, %1|%1, %0}", xops); if (flag_pic) @@ -13784,25 +14066,61 @@ x86_output_mi_thunk (file, delta, function) } else { - if (parm) - xops[1] = gen_rtx_REG (SImode, 0); - else if (aggregate_value_p (TREE_TYPE (TREE_TYPE (function)))) - xops[1] = gen_rtx_MEM (SImode, plus_constant (stack_pointer_rtx, 8)); - else - xops[1] = gen_rtx_MEM (SImode, plus_constant (stack_pointer_rtx, 4)); - output_asm_insn ("add{l} {%0, %1|%1, %0}", xops); + /* Adjust the this parameter by a fixed constant. */ + if (delta) + { + xops[0] = GEN_INT (delta); + xops[1] = ia32_this_parameter (function); + output_asm_insn ("add{l}\t{%0, %1|%1, %0}", xops); + } + + /* Adjust the this parameter by a value stored in the vtable. */ + if (vcall_index) + { + rtx this_parm; + + /* Put the this parameter into %eax. */ + this_parm = ia32_this_parameter (function); + if (!REG_P (this_parm)) + { + xops[0] = this_parm; + xops[1] = gen_rtx_REG (Pmode, 0); + output_asm_insn ("mov{l}\t{%0, %1|%1, %0}", xops); + } + /* Load the virtual table pointer into %edx. */ + if (ix86_fntype_regparm (TREE_TYPE (function)) > 2) + error ("virtual function `%D' cannot have more than two register parameters", + function); + xops[0] = gen_rtx_MEM (Pmode, + gen_rtx_REG (Pmode, 0)); + xops[1] = gen_rtx_REG (Pmode, 1); + output_asm_insn ("mov{l}\t{%0, %1|%1, %0}", xops); + /* Adjust the this parameter. */ + xops[0] = gen_rtx_MEM (SImode, + plus_constant (gen_rtx_REG (Pmode, 1), + vcall_index)); + xops[1] = gen_rtx_REG (Pmode, 0); + output_asm_insn ("add{l}\t{%0, %1|%1, %0}", xops); + /* Put the this parameter back where it came from. 
*/ + if (!REG_P (this_parm)) + { + xops[0] = gen_rtx_REG (Pmode, 0); + xops[1] = ia32_this_parameter (function); + output_asm_insn ("mov{l}\t{%0, %1|%1, %0}", xops); + } + } if (flag_pic) { xops[0] = pic_offset_table_rtx; xops[1] = gen_label_rtx (); - xops[2] = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_"); + xops[2] = gen_rtx_SYMBOL_REF (Pmode, GOT_SYMBOL_NAME); if (ix86_regparm > 2) abort (); output_asm_insn ("push{l}\t%0", xops); output_asm_insn ("call\t%P1", xops); - ASM_OUTPUT_INTERNAL_LABEL (file, "L", CODE_LABEL_NUMBER (xops[1])); + (*targetm.asm_out.internal_label) (file, "L", CODE_LABEL_NUMBER (xops[1])); output_asm_insn ("pop{l}\t%0", xops); output_asm_insn ("add{l}\t{%2+[.-%P1], %0|%0, OFFSET FLAT: %2+[.-%P1]}", xops); @@ -13814,13 +14132,24 @@ x86_output_mi_thunk (file, delta, function) } else { - fprintf (file, "\tjmp "); + fprintf (file, "\tjmp\t"); assemble_name (file, XSTR (XEXP (DECL_RTL (function), 0), 0)); fprintf (file, "\n"); } } } +static void +x86_output_mi_thunk (file, thunk, delta, function) + FILE *file; + tree thunk; + HOST_WIDE_INT delta; + tree function; +{ + x86_output_mi_vcall_thunk (file, thunk, delta, /*vcall_index=*/0, + function); +} + int x86_field_alignment (field, computed) tree field; |
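
The bulk of the mechanical churn above comes from extending struct processor_costs: every per-processor cost table gains a branch cost and six x87 instruction costs, and override_options now pulls the branch cost through the cost table (processor_target_table[ix86_cpu].cost->branch_cost) instead of the separate branch_cost column, which the patch deletes. A minimal sketch of the appended tail of the structure follows; only branch_cost is spelled out in the patch text, the remaining field names are assumptions inferred from the diff's comments.

/* Sketch of the members appended to struct processor_costs by this
   patch.  Existing members are elided; field names other than
   branch_cost are assumptions taken from the comments in the diff.  */
struct processor_costs
{
  /* ... add, lea, shift, multiply, move and prefetch costs ... */
  const int branch_cost;	/* Branch cost */
  const int fadd;		/* cost of FADD and FSUB insns.  */
  const int fmul;		/* cost of FMUL instruction.  */
  const int fdiv;		/* cost of FDIV instruction.  */
  const int fabs;		/* cost of FABS instruction.  */
  const int fchs;		/* cost of FCHS instruction.  */
  const int fsqrt;		/* cost of FSQRT instruction.  */
};

Moving the value into the cost table is also what lets ix86_expand_int_movcc test BRANCH_COST >= 2 in place of optimize_size when deciding whether a conditional branch is worth avoiding.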
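The new ix86_function_ok_for_sibcall hook disables tail-call optimization in three situations: PIC calls that may go through the PLT, calls whose x87 float-return convention differs from the caller's, and indirect calls whose function type passes arguments in all three call-clobbered integer registers, because no register is then free to hold the call target. A hypothetical 32-bit example of the last case follows; the type and function names are invented for illustration.

/* Hypothetical illustration of the regparm >= 3 restriction: with
   regparm (3), %eax, %edx and %ecx all carry arguments, so an indirect
   call has no free call-clobbered register left for the target address
   and the new hook rejects it as a sibcall candidate (-m32).  */
typedef void (*handler_t) (int, int, int) __attribute__ ((regparm (3)));

void
dispatch (handler_t fn, int a, int b, int c)
{
  fn (a, b, c);		/* stays a plain call, not a sibcall jump */
}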
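The patch also adds two SSE2 builtins, __builtin_ia32_pslldqi128 and __builtin_ia32_psrldqi128, expanded through the TImode shift patterns sse2_ashlti3 and sse2_lshrti3 and rejecting any non-immediate count ("shift must be an immediate"). A hedged usage sketch, assuming the count is the bit count fed to the TImode pattern and that __builtin_ia32_pxor128 is available to combine the results:

/* Hypothetical use of the new whole-register shifts.  The count must be
   a compile-time constant; since it reaches a TImode shift pattern it is
   presumably a bit count, so 64 moves one quadword.  */
typedef long long __v2di __attribute__ ((vector_size (16)));

__v2di
swap_quadwords (__v2di x)
{
  __v2di hi = __builtin_ia32_pslldqi128 (x, 64);	/* low qword -> high */
  __v2di lo = __builtin_ia32_psrldqi128 (x, 64);	/* high qword -> low */
  return __builtin_ia32_pxor128 (hi, lo);		/* halves swapped */
}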