path: root/arch/riscv/lib/memcpy.S
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2013 Regents of the University of California
 */

#include <linux/linkage.h>
#include <asm/asm.h>

/* void *memcpy(void *, const void *, size_t) */
ENTRY(__memcpy)
WEAK(memcpy)
	/* Save for return value */
	mv	t6, a0

	/*
	 * Register allocation for code below:
	 * a0 - start of uncopied dst
	 * a1 - start of uncopied src
	 * t0 - end of uncopied dst
	 */
	add	t0, a0, a2
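	/*
	 * t0 is the one-past-the-end pointer of dst; every loop below runs
	 * until a0 reaches t0 (possibly temporarily biased), so t0 - a0 is
	 * the number of bytes still to copy.
	 */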

	/*
	 * Use bytewise copy if too small.
	 *
	 * This threshold must be at least 2*SZREG to ensure at least one
	 * wordwise copy is performed. It is chosen to be 16 because saving at
	 * least 7 iterations of bytewise copy is enough to amortize the fixed
	 * overhead of the wordwise path.
	 */
	li	a3, 16
	bltu	a2, a3, .Lbyte_copy_tail
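	/*
	 * Worked example (assuming RV64, where SZREG == 8): for a2 == 16 with
	 * the worst-case destination alignment, the head loop below copies at
	 * most 7 bytes, leaving at least 9, so at least one full-word copy is
	 * still performed and at least 7 bytewise iterations are saved.
	 */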

	/*
	 * Bytewise copy first to align a0 to a word boundary.
	 */
	addi	a2, a0, SZREG-1
	andi	a2, a2, ~(SZREG-1)
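	/*
	 * a2 now holds a0 rounded up to the next word boundary, i.e.
	 * (a0 + SZREG-1) & ~(SZREG-1); e.g. on RV64, a0 == 0x1003 yields
	 * a2 == 0x1008, so 5 head bytes are copied. Note that a2 no longer
	 * holds the length: it is reused as the alignment target, and t0
	 * (the end of dst) bounds the remaining loops.
	 */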
	beq	a0, a2, 2f
1:
	lb	a5, 0(a1)
	addi	a1, a1, 1
	sb	a5, 0(a0)
	addi	a0, a0, 1
	bne	a0, a2, 1b
2:

	/*
	 * Now a0 is word-aligned. If a1 is also word-aligned, we can perform an
	 * aligned word-wise copy. Otherwise we need to perform a misaligned
	 * word-wise copy.
	 */
	andi	a3, a1, SZREG-1
	bnez	a3, .Lmisaligned_word_copy

	/* Unrolled wordwise copy */
	addi	t0, t0, -(16*SZREG-1)
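	/*
	 * With t0 biased down by 16*SZREG-1, "a0 < t0" means at least 16*SZREG
	 * bytes remain, so each iteration of the loop below can copy a full
	 * block of 16 words (128 bytes on RV64) without overrunning dst.
	 */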
	bgeu	a0, t0, 2f
1:
	REG_L	a2,        0(a1)
	REG_L	a3,    SZREG(a1)
	REG_L	a4,  2*SZREG(a1)
	REG_L	a5,  3*SZREG(a1)
	REG_L	a6,  4*SZREG(a1)
	REG_L	a7,  5*SZREG(a1)
	REG_L	t1,  6*SZREG(a1)
	REG_L	t2,  7*SZREG(a1)
	REG_L	t3,  8*SZREG(a1)
	REG_L	t4,  9*SZREG(a1)
	REG_L	t5, 10*SZREG(a1)
	REG_S	a2,        0(a0)
	REG_S	a3,    SZREG(a0)
	REG_S	a4,  2*SZREG(a0)
	REG_S	a5,  3*SZREG(a0)
	REG_S	a6,  4*SZREG(a0)
	REG_S	a7,  5*SZREG(a0)
	REG_S	t1,  6*SZREG(a0)
	REG_S	t2,  7*SZREG(a0)
	REG_S	t3,  8*SZREG(a0)
	REG_S	t4,  9*SZREG(a0)
	REG_S	t5, 10*SZREG(a0)
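	/*
	 * The 16-word block is split into 11 + 5 loads/stores, presumably so
	 * that the working set fits in the scratch registers used here (a2-a7
	 * and t1-t5 for the first 11 words, a2-a6 reused for the last 5).
	 */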
	REG_L	a2, 11*SZREG(a1)
	REG_L	a3, 12*SZREG(a1)
	REG_L	a4, 13*SZREG(a1)
	REG_L	a5, 14*SZREG(a1)
	REG_L	a6, 15*SZREG(a1)
	addi	a1, a1, 16*SZREG
	REG_S	a2, 11*SZREG(a0)
	REG_S	a3, 12*SZREG(a0)
	REG_S	a4, 13*SZREG(a0)
	REG_S	a5, 14*SZREG(a0)
	REG_S	a6, 15*SZREG(a0)
	addi	a0, a0, 16*SZREG
	bltu	a0, t0, 1b
2:
	/*
	 * Undo the 16*SZREG-1 bias applied above and pre-bias by SZREG-1 for
	 * the wordwise loop below: (16*SZREG-1) - (SZREG-1) = 15*SZREG.
	 */
	addi	t0, t0, 15*SZREG

	/* Wordwise copy */
	bgeu	a0, t0, 2f
1:
	REG_L	a5, 0(a1)
	addi	a1, a1, SZREG
	REG_S	a5, 0(a0)
	addi	a0, a0, SZREG
	bltu	a0, t0, 1b
2:
	addi	t0, t0, SZREG-1
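	/*
	 * t0 once again points to the true end of dst; any remainder shorter
	 * than a full word is handled by the bytewise tail below.
	 */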

.Lbyte_copy_tail:
	/*
	 * Bytewise copy anything left.
	 */
	beq	a0, t0, 2f
1:
	lb	a5, 0(a1)
	addi	a1, a1, 1
	sb	a5, 0(a0)
	addi	a0, a0, 1
	bne	a0, t0, 1b
2:

	mv	a0, t6
	ret

.Lmisaligned_word_copy:
	/*
	 * Misaligned word-wise copy.
	 * For a misaligned copy we still perform word-wise loads and stores,
	 * but each stored word is assembled, using shifts, from the word
	 * fetched in the previous iteration and the one fetched now. This is
	 * safe because we never access more source words than necessary.
	 */

	/* Calculate shifts */
	slli	t3, a3, 3
	sub	t4, x0, t3 /* negate is okay as shift will only look at LSBs */
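	/*
	 * Worked example (assuming RV64): for a source misalignment a3 == 3,
	 * t3 == 24 and, since srl/sll only use the low 6 bits of the shift
	 * amount, t4 behaves as 64 - 24 == 40. Each stored word is then
	 * (current source word >> 24) | (next source word << 40), which
	 * reassembles the bytes in order on a little-endian machine.
	 */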

	/* Load the initial value and align a1 */
	andi	a1, a1, ~(SZREG-1)
	REG_L	a5, 0(a1)
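	/*
	 * Rounding a1 down means this load also fetches the a3 bytes that
	 * precede the current source position, but they lie in the same
	 * aligned word (and therefore the same page), so the access cannot
	 * fault; the first shift in the loop discards them.
	 */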

	addi	t0, t0, -(SZREG-1)
	/* At least one iteration will be executed here, no check */
1:
	srl	a4, a5, t3
	REG_L	a5, SZREG(a1)
	addi	a1, a1, SZREG
	sll	a2, a5, t4
	or	a2, a2, a4
	REG_S	a2, 0(a0)
	addi	a0, a0, SZREG
	bltu	a0, t0, 1b

	/* Update pointers to their correct values */
	addi	t0, t0, SZREG-1
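	/*
	 * a1 still points at the aligned word loaded last; adding the
	 * misalignment a3 back makes it point at the first source byte that
	 * has not yet been stored, ready for the bytewise tail.
	 */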
	add	a1, a1, a3

	j	.Lbyte_copy_tail
END(__memcpy)