/* brig-basic-inst-handler.cc -- brig basic instruction handling
   Copyright (C) 2016-2017 Free Software Foundation, Inc.
   Contributed by Pekka Jaaskelainen <pekka.jaaskelainen@parmance.com>
   for General Processor Tech.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify it under
   the terms of the GNU General Public License as published by the Free
   Software Foundation; either version 3, or (at your option) any later
   version.

   GCC is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or
   FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
   for more details.

   You should have received a copy of the GNU General Public License
   along with GCC; see the file COPYING3.  If not see
   <http://www.gnu.org/licenses/>.  */

#include <sstream>

#include "brig-code-entry-handler.h"
#include "brig-util.h"

#include "errors.h"
#include "gimple-expr.h"
#include "convert.h"
#include "print-tree.h"
#include "tree-pretty-print.h"
#include "langhooks.h"
#include "stor-layout.h"
#include "diagnostic-core.h"
#include "brig-builtins.h"
#include "fold-const.h"

brig_basic_inst_handler::brig_basic_inst_handler (brig_to_generic &parent)
  : brig_code_entry_handler (parent)
{
}

class scalarized_sat_arithmetics : public tree_element_binary_visitor
{
public:
  scalarized_sat_arithmetics (const BrigInstBase &brig_inst)
    : m_brig_inst (brig_inst)
  {
    BrigType16_t element_type = brig_inst.type & BRIG_TYPE_BASE_MASK;

#undef DEF_HSAIL_SAT_BUILTIN
#undef DEF_HSAIL_BUILTIN
#undef DEF_HSAIL_ATOMIC_BUILTIN
#undef DEF_HSAIL_INTR_BUILTIN
#undef DEF_HSAIL_CVT_ZEROI_SAT_BUILTIN
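
/* The brig-builtins.def expansion below generates an if/else chain
   that matches the (opcode, element type) pair against each
   DEF_HSAIL_SAT_BUILTIN entry and records the corresponding scalar
   builtin decl; if no entry matches, the trailing else falls into
   gcc_unreachable.  */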

#define DEF_HSAIL_SAT_BUILTIN(ENUM, BRIG_OPCODE, HSAIL_TYPE,		\
			      NAME, TYPE, ATTRS)			\
    if (brig_inst.opcode == BRIG_OPCODE && element_type == HSAIL_TYPE)	\
      m_builtin = builtin_decl_explicit (ENUM);				\
    else
#include "brig-builtins.def"
      gcc_unreachable ();
  }

  virtual tree
  visit_element (brig_code_entry_handler &, tree operand0, tree operand1)
  {
    /* Implement saturating arithmetic with scalar built-ins for now.
       TODO: emit GENERIC nodes for the simplest cases, or at least
       emit vector built-ins.  */
    return call_builtin (m_builtin, 2, TREE_TYPE (operand0),
			 TREE_TYPE (operand0), operand0,
			 TREE_TYPE (operand1), operand1);
  }
  const BrigInstBase &m_brig_inst;
  tree m_builtin;
};

/* Implements a vector shuffle.  ARITH_TYPE is the type of the vector,
   OPERANDS[0] is the first vector, OPERANDS[1] the second vector, and
   OPERANDS[2] the shuffle mask in HSAIL format.  The output is a
   VEC_PERM_EXPR that implements the shuffle as a GENERIC expression.  */

tree
brig_basic_inst_handler::build_shuffle (tree arith_type,
					tree_stl_vec &operands)
{
  tree element_type
    = get_unsigned_int_type (TREE_TYPE (TREE_TYPE (operands[0])));

  /* Offsets to add to the mask values to convert the
     HSAIL mask to VEC_PERM_EXPR masks.  A VEC_PERM_EXPR mask
     index spans from 0 to two times the vector width, while an
     HSAIL mask refers to the two input vectors separately; the
     HSAIL shuffle is thus not a "full shuffle" where all output
     elements could originate from any input element.  */
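  /* As an illustrative example: with 4-element vectors the
     VEC_PERM_EXPR indices 0..3 select from the first input and 4..7
     from the second.  The loop below adds offset 0 to the mask
     elements in the low half of the result and the element count (4)
     to those in the high half, i.e. an offset vector of {0, 0, 4, 4}.  */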
  vec<constructor_elt, va_gc> *mask_offset_vals = NULL;

  vec<constructor_elt, va_gc> *input_mask_vals = NULL;
  size_t input_mask_element_size
    = exact_log2 (TYPE_VECTOR_SUBPARTS (arith_type));

  /* Unpack the tightly packed mask elements to BIT_FIELD_REFs
     from which to construct the mask vector as understood by
     VEC_PERM_EXPR.  */
  tree mask_operand = add_temp_var ("shuffle_mask", operands[2]);

  tree mask_element_type
    = build_nonstandard_integer_type (input_mask_element_size, true);

  for (size_t i = 0; i < TYPE_VECTOR_SUBPARTS (arith_type); ++i)
    {
      tree mask_element
	= build3 (BIT_FIELD_REF, mask_element_type, mask_operand,
		  bitsize_int (input_mask_element_size),
		  bitsize_int (i * input_mask_element_size));

      mask_element = convert (element_type, mask_element);

      tree offset;
      if (i < TYPE_VECTOR_SUBPARTS (arith_type) / 2)
	offset = build_int_cst (element_type, 0);
      else
	offset
	  = build_int_cst (element_type, TYPE_VECTOR_SUBPARTS (arith_type));

      CONSTRUCTOR_APPEND_ELT (mask_offset_vals, NULL_TREE, offset);
      CONSTRUCTOR_APPEND_ELT (input_mask_vals, NULL_TREE, mask_element);
    }
  tree mask_vec_type
    = build_vector_type (element_type, TYPE_VECTOR_SUBPARTS (arith_type));

  tree mask_vec = build_constructor (mask_vec_type, input_mask_vals);
  tree offset_vec = build_constructor (mask_vec_type, mask_offset_vals);

  tree mask = build2 (PLUS_EXPR, mask_vec_type, mask_vec, offset_vec);

  tree perm = build3 (VEC_PERM_EXPR, TREE_TYPE (operands[0]), operands[0],
		      operands[1], mask);
  return perm;
}

/* Unpacks (extracts) a scalar element with an index in OPERANDS[1]
   from the vector expression in OPERANDS[0].  */

tree
brig_basic_inst_handler::build_unpack (tree_stl_vec &operands)
{
  /* Implement the unpack with a shuffle that moves the unpacked
     element to the lowest bit positions in the destination.  After
     that, a bitwise AND is used to clear the uppermost bits.  */
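  /* As an illustrative example, extracting element 2 of a 4-element
     vector uses the shuffle mask {2, 0, 0, 0} (only the lowest lane
     matters) followed by an AND with the mask {-1, 0, 0, 0} that
     zeros the other lanes.  */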
  tree src_element_type = TREE_TYPE (TREE_TYPE (operands[0]));

  /* Perform the operations with a raw (unsigned int type) type.  */
  tree element_type = get_unsigned_int_type (src_element_type);

  vec<constructor_elt, va_gc> *input_mask_vals = NULL;
  vec<constructor_elt, va_gc> *and_mask_vals = NULL;

  size_t element_count = TYPE_VECTOR_SUBPARTS (TREE_TYPE (operands[0]));
  tree vec_type = build_vector_type (element_type, element_count);

  for (size_t i = 0; i < element_count; ++i)
    {
      tree mask_element;
      if (i == 0)
	mask_element = convert (element_type, operands[1]);
      else
	mask_element = build_int_cst (element_type, 0);

      CONSTRUCTOR_APPEND_ELT (input_mask_vals, NULL_TREE, mask_element);

      tree and_mask_element;
      if (i == 0)
	and_mask_element = build_int_cst (element_type, -1);
      else
	and_mask_element = build_int_cst (element_type, 0);
      CONSTRUCTOR_APPEND_ELT (and_mask_vals, NULL_TREE, and_mask_element);
    }

  tree mask_vec = build_constructor (vec_type, input_mask_vals);

  tree and_mask_vec = build_constructor (vec_type, and_mask_vals);

  tree perm = build3 (VEC_PERM_EXPR, vec_type,
		      build_reinterpret_cast (vec_type, operands[0]),
		      build_reinterpret_cast (vec_type, operands[0]), mask_vec);

  tree cleared = build2 (BIT_AND_EXPR, vec_type, perm, and_mask_vec);

  size_t s = int_size_in_bytes (TREE_TYPE (cleared)) * BITS_PER_UNIT;
  tree raw_type = build_nonstandard_integer_type (s, true);

  tree as_int = build_reinterpret_cast (raw_type, cleared);

  if (int_size_in_bytes (src_element_type) < 4)
    {
      if (INTEGRAL_TYPE_P (src_element_type))
	return extend_int (as_int, uint32_type_node, src_element_type);
    }
  return as_int;
}

/* Packs (inserts) the scalar element in OPERANDS[1]
   into the vector in OPERANDS[0] at the element position defined by
   OPERANDS[2].  */

tree
brig_basic_inst_handler::build_pack (tree_stl_vec &operands)
{
  /* Implement using a bit level insertion.
     TODO: Reuse this for implementing 'bitinsert'
     without a builtin call.  */

  size_t ecount = TYPE_VECTOR_SUBPARTS (TREE_TYPE (operands[0]));
  size_t vecsize = int_size_in_bytes (TREE_TYPE (operands[0])) * BITS_PER_UNIT;
  tree wide_type = build_nonstandard_integer_type (vecsize, 1);

  tree src_vect = build_reinterpret_cast (wide_type, operands[0]);
  src_vect = add_temp_var ("src_vect", src_vect);

  tree scalar = operands[1];
  scalar = add_temp_var ("scalar", convert_to_integer (wide_type, scalar));

  tree pos = operands[2];

  /* The upper bits of the position can contain garbage.
     Zero them for well-defined semantics.  */
  tree t = build2 (BIT_AND_EXPR, TREE_TYPE (pos), operands[2],
		   build_int_cstu (TREE_TYPE (pos), ecount - 1));
  pos = add_temp_var ("pos", convert (wide_type, t));

  tree element_type = TREE_TYPE (TREE_TYPE (operands[0]));
  size_t element_width = int_size_in_bytes (element_type) * BITS_PER_UNIT;
  tree ewidth = build_int_cstu (wide_type, element_width);

  tree bitoffset = build2 (MULT_EXPR, wide_type, ewidth, pos);
  bitoffset = add_temp_var ("offset", bitoffset);

  uint64_t mask_int
    = element_width == 64 ? (uint64_t) -1 : ((uint64_t) 1 << element_width) - 1;
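  /* The 64-bit element width is special-cased above because shifting
     a 64-bit value by 64 is undefined behavior in C; for narrower
     elements the shift produces the expected mask, e.g. 0xff for
     8-bit elements.  */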

  tree mask = build_int_cstu (wide_type, mask_int);

  mask = add_temp_var ("mask", convert_to_integer (wide_type, mask));

  tree clearing_mask
    = build1 (BIT_NOT_EXPR, wide_type,
	      build2 (LSHIFT_EXPR, wide_type, mask, bitoffset));

  tree zeroed_element
    = build2 (BIT_AND_EXPR, wide_type, src_vect, clearing_mask);

  /* TODO: Is the AND necessary?  Does HSA define what
     happens if the upper bits of the inserted element are not
     zero?  */
  tree element_in_position
    = build2 (LSHIFT_EXPR, wide_type,
	      build2 (BIT_AND_EXPR, wide_type, scalar, mask), bitoffset);

  tree inserted
    = build2 (BIT_IOR_EXPR, wide_type, zeroed_element, element_in_position);
  return inserted;
}

/* Implements unpack{lo,hi}.  BRIG_OPCODE tells which of the two, and
   ARITH_TYPE describes the type of the vector arithmetic.
   OPERANDS[0] and OPERANDS[1] are the input vectors.  */

tree
brig_basic_inst_handler::build_unpack_lo_or_hi (BrigOpcode16_t brig_opcode,
						tree arith_type,
						tree_stl_vec &operands)
{
  tree element_type = get_unsigned_int_type (TREE_TYPE (arith_type));
  tree mask_vec_type
    = build_vector_type (element_type, TYPE_VECTOR_SUBPARTS (arith_type));

  size_t element_count = TYPE_VECTOR_SUBPARTS (arith_type);
  vec<constructor_elt, va_gc> *input_mask_vals = NULL;

  size_t offset = (brig_opcode == BRIG_OPCODE_UNPACKLO) ? 0 : element_count / 2;
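  /* The loop below interleaves elements of the two inputs.  E.g. for
     4-element vectors UNPACKLO builds the mask {0, 4, 1, 5} and
     UNPACKHI the mask {2, 6, 3, 7}, where indices 4..7 select from
     the second input.  */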

  for (size_t i = 0; i < element_count / 2; ++i)
    {
      CONSTRUCTOR_APPEND_ELT (input_mask_vals, NULL_TREE,
			      build_int_cst (element_type, offset + i));
      CONSTRUCTOR_APPEND_ELT (input_mask_vals, NULL_TREE,
			      build_int_cst (element_type,
					     offset + i + element_count));
    }

  tree mask_vec = build_constructor (mask_vec_type, input_mask_vals);

  tree perm = build3 (VEC_PERM_EXPR, TREE_TYPE (operands[0]), operands[0],
		      operands[1], mask_vec);
  return perm;
}

/* Builds a basic instruction expression from a BRIG instruction.  BRIG_OPCODE
   is the opcode, BRIG_TYPE the brig type of the instruction, ARITH_TYPE the
   desired tree type for the instruction, and OPERANDS the instruction's
   input operands already converted to tree nodes.  */

tree
brig_basic_inst_handler::build_inst_expr (BrigOpcode16_t brig_opcode,
					  BrigType16_t brig_type,
					  tree arith_type,
					  tree_stl_vec &operands)
{
  tree_code opcode = get_tree_code_for_hsa_opcode (brig_opcode, brig_type);

  BrigType16_t inner_type = brig_type & BRIG_TYPE_BASE_MASK;

  tree instr_inner_type
    = VECTOR_TYPE_P (arith_type) ? TREE_TYPE (arith_type) : arith_type;

  if (opcode == RSHIFT_EXPR || opcode == LSHIFT_EXPR)
    {
      /* HSA defines modulo/clipping behavior for shift amounts larger
	 than the bit width, while tree.def leaves it undefined.
	 We need to mask the upper bits to ensure the defined behavior.  */
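      /* E.g. a 32-bit shift left by an amount of 33 must behave like
	 a shift by 1; masking the amount with the bit width minus one
	 (0x1f) implements that modulo behavior.  */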
      tree scalar_mask
	= build_int_cst (instr_inner_type,
			 gccbrig_hsa_type_bit_size (inner_type) - 1);

      tree mask = VECTOR_TYPE_P (arith_type)
		    ? build_vector_from_val (arith_type, scalar_mask)
		    : scalar_mask;

      /* The shift amount is a scalar; broadcast it to produce
	 a vector shift.  */
      if (VECTOR_TYPE_P (arith_type))
	operands[1] = build_vector_from_val (arith_type, operands[1]);
      operands[1] = build2 (BIT_AND_EXPR, arith_type, operands[1], mask);
    }

  size_t input_count = operands.size ();
  size_t output_count = gccbrig_hsa_opcode_op_output_p (brig_opcode, 0) ?
    1 : 0;

  if (opcode == TREE_LIST)
    {
      /* There was no direct GENERIC opcode for the instruction;
	 try to emulate it with a chain of GENERIC nodes.  */
      if (brig_opcode == BRIG_OPCODE_MAD || brig_opcode == BRIG_OPCODE_MAD24)
	{
	  /* There doesn't seem to be a "standard" MAD built-in in gcc,
	     so let's use a chain of multiply + add for now (the double
	     rounding method).  It should be easier for optimizers than
	     a custom built-in call.  WIDEN_MULT_EXPR is close, but
	     requires a double-size result type.  */
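	  /* "Double rounding" refers to the multiply and the add
	     rounding separately, unlike a fused multiply-add which
	     rounds only once.  */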
	  tree mult_res
	    = build2 (MULT_EXPR, arith_type, operands[0], operands[1]);
	  return build2 (PLUS_EXPR, arith_type, mult_res, operands[2]);
	}
      else if (brig_opcode == BRIG_OPCODE_MAD24HI)
	{
	  tree mult_res
	    = build2 (MULT_HIGHPART_EXPR, arith_type, operands[0], operands[1]);
	  return build2 (PLUS_EXPR, arith_type, mult_res, operands[2]);
	}
      else if (brig_opcode == BRIG_OPCODE_SHUFFLE)
	{
	  return build_shuffle (arith_type, operands);
	}
      else if (brig_opcode == BRIG_OPCODE_UNPACKLO
	       || brig_opcode == BRIG_OPCODE_UNPACKHI)
	{
	  return build_unpack_lo_or_hi (brig_opcode, arith_type, operands);
	}
      else if (brig_opcode == BRIG_OPCODE_UNPACK)
	{
	  return build_unpack (operands);
	}
      else if (brig_opcode == BRIG_OPCODE_PACK)
	{
	  return build_pack (operands);
	}
      else if (brig_opcode == BRIG_OPCODE_NRSQRT)
	{
	  /* Implement as 1.0/sqrt (x) and assume gcc's instruction
	     selection replaces the division with a native instruction,
	     if one is available.
	     TODO: that happens only with unsafe math optimizations
	     enabled, which cannot be used in general while remaining
	     HSAIL compliant.  Perhaps a builtin call would be a better
	     option here.  */
	  return build2 (RDIV_EXPR, arith_type, build_one_cst (arith_type),
			 expand_or_call_builtin (BRIG_OPCODE_SQRT, brig_type,
						 arith_type, operands));
	}
      else if (brig_opcode == BRIG_OPCODE_NRCP)
	{
	  /* Implement as 1.0/x and assume gcc's instruction selection
	     replaces the division with a native instruction, if one is
	     available.  */
	  return build2 (RDIV_EXPR, arith_type, build_one_cst (arith_type),
			 operands[0]);
	}
      else if (brig_opcode == BRIG_OPCODE_LANEID
	       || brig_opcode == BRIG_OPCODE_MAXWAVEID
	       || brig_opcode == BRIG_OPCODE_WAVEID)
	{
	  /* WAVESIZE is assumed to be 1 (for now); therefore LANEID,
	     WAVEID and MAXWAVEID always return 0.  */
	  return build_zero_cst (arith_type);
	}
      else
	gcc_unreachable ();
    }
  else if (opcode == CALL_EXPR)
    return expand_or_call_builtin (brig_opcode, brig_type, arith_type,
				   operands);
  else if (output_count == 1)
    {
      if (input_count == 1)
	{
	  if (opcode == MODIFY_EXPR)
	    return operands[0];
	  else
	    return build1 (opcode, arith_type, operands[0]);
	}
      else if (input_count == 2)
	return build2 (opcode, arith_type, operands[0], operands[1]);
      else if (input_count == 3)
	return build3 (opcode, arith_type, operands[0], operands[1],
		       operands[2]);
      else
	gcc_unreachable ();
    }
  else
    gcc_unreachable ();

  return NULL_TREE;
}

/* Handles the basic instructions, including packed instructions. Deals
   with the different packing modes by unpacking/packing the wanted
   elements.  Delegates most of the instruction cases to build_inst_expr(). */

size_t
brig_basic_inst_handler::operator () (const BrigBase *base)
{
  const BrigInstBase *brig_inst = (const BrigInstBase *) base;

  tree_stl_vec operands = build_operands (*brig_inst);

  size_t output_count
    = gccbrig_hsa_opcode_op_output_p (brig_inst->opcode, 0) ? 1 : 0;
  size_t input_count
    = operands.size () == 0 ? 0 : (operands.size () - output_count);

  gcc_assert (output_count == 0 || output_count == 1);

  tree_stl_vec::iterator first_input_i = operands.begin ();
  if (output_count > 0 && operands.size () > 0)
    ++first_input_i;

  tree_stl_vec in_operands;
  in_operands.assign (first_input_i, operands.end ());

  BrigType16_t brig_inst_type = brig_inst->type;

  if (brig_inst->opcode == BRIG_OPCODE_NOP)
    return base->byteCount;
  else if (brig_inst->opcode == BRIG_OPCODE_FIRSTBIT
	   || brig_inst->opcode == BRIG_OPCODE_LASTBIT
	   || brig_inst->opcode == BRIG_OPCODE_SAD)
    /* These instructions are reported to be always 32b in HSAIL, but we want
       to treat them according to their input argument's type to select the
       correct instruction/builtin.  */
    brig_inst_type
      = gccbrig_tree_type_to_hsa_type (TREE_TYPE (in_operands[0]));

  tree instr_type = gccbrig_tree_type_for_hsa_type (brig_inst_type);

  if (!instr_type)
    {
      gcc_unreachable ();
      return base->byteCount;
    }

  bool is_vec_instr = hsa_type_packed_p (brig_inst_type);

  size_t element_size_bits;
  size_t element_count;

  if (is_vec_instr)
    {
      BrigType16_t brig_element_type = brig_inst_type & BRIG_TYPE_BASE_MASK;
      element_size_bits = gccbrig_hsa_type_bit_size (brig_element_type);
      element_count = gccbrig_hsa_type_bit_size (brig_inst_type)
	/ gccbrig_hsa_type_bit_size (brig_element_type);
    }
  else
    {
      element_size_bits = gccbrig_hsa_type_bit_size (brig_inst_type);
      element_count = 1;
    }

  /* The actual arithmetic type the operation should be performed
     with.  This is not always the same as the original BRIG
     opcode's type due to implicit conversions of storage-only f16.  */
  tree arith_type = gccbrig_is_bit_operation (brig_inst->opcode)
		      ? gccbrig_tree_type_for_hsa_type (brig_inst_type)
		      : get_tree_expr_type_for_hsa_type (brig_inst_type);
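  /* For non-bit operations on f16 the expression type is assumed to
     be f32: storage-only f16 values are extended for the computation
     and narrowed back when storing the result (see the fp16 handling
     further below).  */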

  tree instr_expr = NULL_TREE;

  BrigPack8_t p = BRIG_PACK_NONE;
  if (brig_inst->base.kind == BRIG_KIND_INST_MOD)
    p = ((const BrigInstMod *) brig_inst)->pack;
  else if (brig_inst->base.kind == BRIG_KIND_INST_CMP)
    p = ((const BrigInstCmp *) brig_inst)->pack;

  if (p == BRIG_PACK_PS || p == BRIG_PACK_PSSAT)
    in_operands[1] = build_lower_element_broadcast (in_operands[1]);
  else if (p == BRIG_PACK_SP || p == BRIG_PACK_SPSAT)
    in_operands[0] = build_lower_element_broadcast (in_operands[0]);

  tree_code opcode
    = get_tree_code_for_hsa_opcode (brig_inst->opcode, brig_inst_type);

  if (p >= BRIG_PACK_PPSAT && p <= BRIG_PACK_PSAT)
    {
      scalarized_sat_arithmetics sat_arith (*brig_inst);
      gcc_assert (input_count == 2);
      instr_expr = sat_arith (*this, in_operands[0], in_operands[1]);
    }
  else if (opcode == RETURN_EXPR)
    {
      if (m_parent.m_cf->m_is_kernel)
	{
	  tree goto_stmt
	    = build1 (GOTO_EXPR, void_type_node, m_parent.m_cf->m_exit_label);
	  m_parent.m_cf->append_statement (goto_stmt);
	  return base->byteCount;
	}
      else
	{
	  m_parent.m_cf->append_return_stmt ();
	  return base->byteCount;
	}
    }
  else if (opcode == MULT_HIGHPART_EXPR &&
	   is_vec_instr && element_size_bits < 64)
    {
      /* MULT_HIGHPART_EXPR works only on target-dependent vector sizes,
	 and even the scalar versions do not seem to work, at least
	 for char elements.

	 Fall back to scalarizing and promoting the vector elements to
	 larger types, with the MULHI computed as a regular MUL followed
	 by a shift.  MULHI for 2x64b seems to work with the Intel CPUs
	 tested so far, so that case is passed on for vector processing
	 and no 128b scalar arithmetic is needed.

	 This is not modular, as this sort of thing does not belong in
	 the frontend; there should be a legalization phase before the
	 backend that figures out the best way to compute the MULHI for
	 any integer vector datatype.

	 TODO: promote to larger vector types instead.  For example,
	 MULT_HIGHPART_EXPR with s8x8 doesn't work, but s16x8 seems to,
	 at least on x86-64.  */
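      /* E.g. for u16 elements each MULHI becomes
	 (uint32_t) a * (uint32_t) b >> 16, which is what the loop
	 below builds with MULT_EXPR and RSHIFT_EXPR on the promoted
	 type.  */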
      tree_stl_vec operand0_elements;
      if (input_count > 0)
	unpack (in_operands[0], operand0_elements);

      tree_stl_vec operand1_elements;
      if (input_count > 1)
	unpack (in_operands[1], operand1_elements);

      tree_stl_vec result_elements;

      tree scalar_type = TREE_TYPE (arith_type);
      BrigType16_t element_type = brig_inst_type & BRIG_TYPE_BASE_MASK;
      tree promoted_type = short_integer_type_node;
      switch (element_type)
	{
	case BRIG_TYPE_S8:
	  promoted_type = gccbrig_tree_type_for_hsa_type (BRIG_TYPE_S16);
	  break;
	case BRIG_TYPE_U8:
	  promoted_type = gccbrig_tree_type_for_hsa_type (BRIG_TYPE_U16);
	  break;
	case BRIG_TYPE_S16:
	  promoted_type = gccbrig_tree_type_for_hsa_type (BRIG_TYPE_S32);
	  break;
	case BRIG_TYPE_U16:
	  promoted_type = gccbrig_tree_type_for_hsa_type (BRIG_TYPE_U32);
	  break;
	case BRIG_TYPE_S32:
	  promoted_type = gccbrig_tree_type_for_hsa_type (BRIG_TYPE_S64);
	  break;
	case BRIG_TYPE_U32:
	  promoted_type = gccbrig_tree_type_for_hsa_type (BRIG_TYPE_U64);
	  break;
	default:
	  gcc_unreachable ();
	}

      size_t promoted_type_size = int_size_in_bytes (promoted_type) * 8;

      for (size_t i = 0; i < TYPE_VECTOR_SUBPARTS (arith_type); ++i)
	{
	  tree operand0 = convert (promoted_type, operand0_elements.at (i));
	  tree operand1 = convert (promoted_type, operand1_elements.at (i));

	  tree scalar_expr
	    = build2 (MULT_EXPR, promoted_type, operand0, operand1);

	  scalar_expr
	    = build2 (RSHIFT_EXPR, promoted_type, scalar_expr,
		      build_int_cstu (promoted_type, promoted_type_size / 2));

	  result_elements.push_back (convert (scalar_type, scalar_expr));
	}
      instr_expr = pack (result_elements);
    }
  else
    {
      /* 'class' always has b1 type; consider it by its source
	 float type when building the instruction so that the
	 correct builtin is found.  */
      if (brig_inst->opcode == BRIG_OPCODE_CLASS)
	brig_inst_type = ((const BrigInstSourceType *) base)->sourceType;
      instr_expr = build_inst_expr (brig_inst->opcode, brig_inst_type,
				     arith_type, in_operands);
    }

  if (instr_expr == NULL_TREE)
    {
      gcc_unreachable ();
      return base->byteCount;
    }

  if (p == BRIG_PACK_SS || p == BRIG_PACK_S || p == BRIG_PACK_SSSAT
      || p == BRIG_PACK_SSAT)
    {
      /* In the case of _s_ or _ss_, select only the lowest element
	 from the new input to the output.  We could extract
	 the element and use a scalar operation, but try
	 to keep the data in vector registers as much as possible
	 to avoid copies between the scalar and vector datapaths.  */
      tree old_value;
      tree half_storage_type = gccbrig_tree_type_for_hsa_type (brig_inst_type);
      bool is_fp16_operation
	= (brig_inst_type & BRIG_TYPE_BASE_MASK) == BRIG_TYPE_F16
	&& !gccbrig_is_bit_operation (brig_inst->opcode);

      if (is_fp16_operation)
	old_value = build_h2f_conversion
	  (build_reinterpret_cast (half_storage_type, operands[0]));
      else
	old_value
	  = build_reinterpret_cast (TREE_TYPE (instr_expr), operands[0]);

      size_t esize = is_fp16_operation ? 32 : element_size_bits;

      /* Construct a permutation mask where all elements except the
	 lowest one are picked from the old_value.  */
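      /* E.g. for a 4-element result the mask is {4, 1, 2, 3}: index 4
	 selects the lowest element of new_value (the second
	 VEC_PERM_EXPR operand) and indices 1..3 keep the old_value
	 elements.  */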
      tree mask_inner_type = build_nonstandard_integer_type (esize, 1);
      vec<constructor_elt, va_gc> *constructor_vals = NULL;
      for (size_t i = 0; i < element_count; ++i)
	{
	  tree cst;

	  if (i == 0)
	    cst = build_int_cstu (mask_inner_type, element_count);
	  else
	    cst = build_int_cstu (mask_inner_type, i);
	  CONSTRUCTOR_APPEND_ELT (constructor_vals, NULL_TREE, cst);
	}
      tree mask_vec_type = build_vector_type (mask_inner_type, element_count);
      tree mask = build_vector_from_ctor (mask_vec_type, constructor_vals);

      tree new_value = create_tmp_var (TREE_TYPE (instr_expr), "new_output");
      tree assign
	= build2 (MODIFY_EXPR, TREE_TYPE (instr_expr), new_value, instr_expr);
      m_parent.m_cf->append_statement (assign);

      instr_expr
	= build3 (VEC_PERM_EXPR, arith_type, old_value, new_value, mask);

      tree lower_output = create_tmp_var (TREE_TYPE (instr_expr), "s_output");
      tree assign_lower = build2 (MODIFY_EXPR, TREE_TYPE (instr_expr),
				  lower_output, instr_expr);
      m_parent.m_cf->append_statement (assign_lower);
      instr_expr = lower_output;
    }

  if (output_count == 1)
    build_output_assignment (*brig_inst, operands[0], instr_expr);
  else
    m_parent.m_cf->append_statement (instr_expr);
  return base->byteCount;
}

/* Create an expression that broadcasts the lowest element of the
   vector in VEC_OPERAND to all elements of the returned vector.  */

tree
brig_basic_inst_handler::build_lower_element_broadcast (tree vec_operand)
{
  /* Build the broadcast using a shuffle, because there's no
     direct broadcast in GENERIC and this way there's no need for
     a separate extract of the lowest element.  */
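  /* E.g. for a 4-element vector the mask built below is {4, 4, 4, 4},
     which selects element 0 of the second VEC_PERM_EXPR operand (the
     same vector) for every output lane.  */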
  tree element_type = TREE_TYPE (TREE_TYPE (vec_operand));
  size_t esize = 8 * int_size_in_bytes (element_type);

  size_t element_count = TYPE_VECTOR_SUBPARTS (TREE_TYPE (vec_operand));
  tree mask_inner_type = build_nonstandard_integer_type (esize, 1);
  vec<constructor_elt, va_gc> *constructor_vals = NULL;

  /* Construct the mask.  */
  for (size_t i = 0; i < element_count; ++i)
    {
      tree cst = build_int_cstu (mask_inner_type, element_count);
      CONSTRUCTOR_APPEND_ELT (constructor_vals, NULL_TREE, cst);
    }
  tree mask_vec_type = build_vector_type (mask_inner_type, element_count);
  tree mask = build_vector_from_ctor (mask_vec_type, constructor_vals);

  return build3 (VEC_PERM_EXPR, TREE_TYPE (vec_operand), vec_operand,
		 vec_operand, mask);
}

/* Returns the tree code that should be used to implement the given
   HSA instruction opcode (BRIG_OPCODE) for the given type of instruction
   (BRIG_TYPE).  In case the opcode cannot be mapped to a TREE node directly,
   returns TREE_LIST (if it can be emulated with a simple chain of tree
   nodes) or CALL_EXPR if the opcode should be implemented using a builtin
   call.  */

tree_code
brig_basic_inst_handler::get_tree_code_for_hsa_opcode
  (BrigOpcode16_t brig_opcode, BrigType16_t brig_type) const
{
  BrigType16_t brig_inner_type = brig_type & BRIG_TYPE_BASE_MASK;
  switch (brig_opcode)
    {
    case BRIG_OPCODE_NOP:
      return NOP_EXPR;
    case BRIG_OPCODE_ADD:
      return PLUS_EXPR;
    case BRIG_OPCODE_CMOV:
      if (brig_inner_type == brig_type)
	return COND_EXPR;
      else
	return VEC_COND_EXPR;
    case BRIG_OPCODE_SUB:
      return MINUS_EXPR;
    case BRIG_OPCODE_MUL:
    case BRIG_OPCODE_MUL24:
      return MULT_EXPR;
    case BRIG_OPCODE_MULHI:
    case BRIG_OPCODE_MUL24HI:
      return MULT_HIGHPART_EXPR;
    case BRIG_OPCODE_DIV:
      if (gccbrig_is_float_type (brig_inner_type))
	return RDIV_EXPR;
      else
	return TRUNC_DIV_EXPR;
    case BRIG_OPCODE_NEG:
      return NEGATE_EXPR;
    case BRIG_OPCODE_MIN:
      if (gccbrig_is_float_type (brig_inner_type))
	return CALL_EXPR;
      else
	return MIN_EXPR;
    case BRIG_OPCODE_MAX:
      if (gccbrig_is_float_type (brig_inner_type))
	return CALL_EXPR;
      else
	return MAX_EXPR;
    case BRIG_OPCODE_FMA:
      return FMA_EXPR;
    case BRIG_OPCODE_ABS:
      return ABS_EXPR;
    case BRIG_OPCODE_SHL:
      return LSHIFT_EXPR;
    case BRIG_OPCODE_SHR:
      return RSHIFT_EXPR;
    case BRIG_OPCODE_OR:
      return BIT_IOR_EXPR;
    case BRIG_OPCODE_XOR:
      return BIT_XOR_EXPR;
    case BRIG_OPCODE_AND:
      return BIT_AND_EXPR;
    case BRIG_OPCODE_NOT:
      return BIT_NOT_EXPR;
    case BRIG_OPCODE_RET:
      return RETURN_EXPR;
    case BRIG_OPCODE_MOV:
    case BRIG_OPCODE_LDF:
      return MODIFY_EXPR;
    case BRIG_OPCODE_LD:
    case BRIG_OPCODE_ST:
      return MEM_REF;
    case BRIG_OPCODE_BR:
      return GOTO_EXPR;
    case BRIG_OPCODE_REM:
      if (brig_type == BRIG_TYPE_U64 || brig_type == BRIG_TYPE_U32)
	return TRUNC_MOD_EXPR;
      else
	return CALL_EXPR;
    case BRIG_OPCODE_NRCP:
    case BRIG_OPCODE_NRSQRT:
      /* Implement as 1/f (x).  gcc should pattern-detect that and
	 use a native instruction for it, if available.  */
      return TREE_LIST;
    case BRIG_OPCODE_FLOOR:
    case BRIG_OPCODE_CEIL:
    case BRIG_OPCODE_SQRT:
    case BRIG_OPCODE_NSQRT:
    case BRIG_OPCODE_RINT:
    case BRIG_OPCODE_TRUNC:
    case BRIG_OPCODE_POPCOUNT:
    case BRIG_OPCODE_COPYSIGN:
    case BRIG_OPCODE_NCOS:
    case BRIG_OPCODE_NSIN:
    case BRIG_OPCODE_NLOG2:
    case BRIG_OPCODE_NEXP2:
    case BRIG_OPCODE_NFMA:
      /* Class has type B1 regardless of the float type, thus
	 the builtin map search below cannot find it.  */
    case BRIG_OPCODE_CLASS:
    case BRIG_OPCODE_WORKITEMABSID:
      return CALL_EXPR;
    default:

      /* Some BRIG opcodes can use the same builtins for unsigned and
	 signed types.  Force these cases to unsigned types.  */

      if (brig_opcode == BRIG_OPCODE_BORROW
	  || brig_opcode == BRIG_OPCODE_CARRY
	  || brig_opcode == BRIG_OPCODE_LASTBIT
	  || brig_opcode == BRIG_OPCODE_BITINSERT)
	{
	  if (brig_type == BRIG_TYPE_S32)
	    brig_type = BRIG_TYPE_U32;
	  else if (brig_type == BRIG_TYPE_S64)
	    brig_type = BRIG_TYPE_U64;
	}


      builtin_map::const_iterator i
	= s_custom_builtins.find (std::make_pair (brig_opcode, brig_type));
      if (i != s_custom_builtins.end ())
	return CALL_EXPR;
      else if (s_custom_builtins.find
	       (std::make_pair (brig_opcode, brig_inner_type))
	       != s_custom_builtins.end ())
	return CALL_EXPR;
      if (brig_inner_type == BRIG_TYPE_F16
	  && s_custom_builtins.find
	  (std::make_pair (brig_opcode, BRIG_TYPE_F32))
	  != s_custom_builtins.end ())
	return CALL_EXPR;
      break;
    }
  return TREE_LIST; /* Emulate using a chain of nodes.  */
}