/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2024 Getz Mikalsen <getz@FreeBSD.org>
*/

#include <machine/asm.h>

	.weak	memccpy
	.set	memccpy, __memccpy
	.text
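
/*
 * memccpy(dst, src, c, len) copies at most len bytes from src to dst,
 * stopping after the first occurrence of the byte c has been copied.
 * It returns a pointer to the byte in dst just past the copy of c, or
 * NULL if c was not found in the first len bytes.
 *
 * As a rough reference for the SIMD code below, here is a minimal
 * portable sketch of the same semantics (not the algorithm used here):
 *
 *	void *
 *	memccpy(void *dst, const void *src, int c, size_t len)
 *	{
 *		unsigned char *d = dst;
 *		const unsigned char *s = src;
 *
 *		while (len-- > 0)
 *			if ((*d++ = *s++) == (unsigned char)c)
 *				return (d);
 *		return (NULL);
 *	}
 *
 * The assembly instead processes the string in 16-byte chunks: cmeq
 * marks bytes equal to c, shrn compresses the result to a 64-bit
 * nibble mask, and rbit+clz locates the first match.
 */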

ENTRY(__memccpy)
	subs	x3, x3, #1		// adjust len, check for len == 0
	b.lo	.L0			// return NULL if len was zero

	dup	v0.16b, w2		// duplicate the search char into every lane

	mov	x9, x0			// stash copy of src pointer
	bic	x10, x1, #0xf		// src aligned
	and	x11, x1, #0xf		// src offset

	ldr	q1, [x10]		// load first (aligned) chunk
	cmeq	v1.16b, v1.16b, v0.16b	// bytewise compare against src char

	mov	x8, #-1			// prepare a 0xfff..fff register
	mov	x6, #0xf

	lsl	x12, x11, #2
	lsl	x8, x8, x12		// mask of bytes in the string

	shrn	v1.8b, v1.8h, #4
	fmov	x5, d1
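
	/*
	 * The cmeq/shrn/fmov sequence above turns the 16-byte compare
	 * result into a 64-bit nibble mask: shrn shifts each 16-bit
	 * lane right by 4 and narrows it to 8 bits, leaving one nibble
	 * (0x0 or 0xf) per string byte in d1.  This is why byte offsets
	 * are scaled by 4 (lsl #2) whenever they index into a mask.
	 */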

	sub	x12, x11, #32
	adds	x12, x12, x3		// distance from alignment boundary - 32
	b.cc	.Lrunt			// branch if buffer ends in first two chunks

	ands	x8, x8, x5		// mask out matches before the string
	b.eq	0f			// no match in first chunk?

	/* match in first chunk */
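	/*
	 * rbit+clz emulates a trailing-zero count: the bit index of the
	 * first set bit, divided by 4, is the byte index of the first
	 * occurrence of c in the chunk.
	 */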
	rbit	x8, x8
	clz	x8, x8			// index of match
	lsr	x8, x8, #2

	sub	x8, x8, x11		// ... from beginning of the string

	add	x0, x0, x8
	add	x4, x9, x8		// dst + cnt
	add	x5, x1, x8		// src + cnt
	add	x0, x0, #1

	b	.L0816

0:
	ldr	q3,	[x10, #16]	// load second string chunk
	ldr	q2,	[x1]		// load true head
	cmeq	v1.16b, v3.16b, v0.16b	// char found in second chunk?

	/* process second chunk */
	shrn	v1.8b, v1.8h, #4
	fmov	x5, d1

	cbz	x5, 0f

	/* match in second chunk */
	rbit	x8, x5
	clz	x8, x8			// index of match
	lsr	x8, x8, #2

	sub	x11, x11, #16
	sub	x8, x8, x11		// adjust for alignment offset
	add	x0, x0, x8		// return value
	add	x0, x0, #1

	add	x4, x9, x8
	add	x5, x1, x8
	b	.L1732

0:
	/* char not found in second chunk and buffer not exhausted */
	ldr	q1,	[x10, #32]	// load next string chunk
	str	q2,	[x0]		// deposit head into buffer
	sub	x0, x0, x11		// adjust x0
	mov	x3, x12
	str	q3,	[x0, #16]	// deposit second chunk

	add	x10, x10, #32		// advance src
	add	x0, x0, #32		// advance dst
	subs	x3, x3, #16		// enough left for another round?
	b.lo	1f

	/* main loop unrolled twice */
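	/*
	 * Each iteration checks and copies two 16-byte chunks; x3 holds
	 * the remaining buffer length.  A non-zero nibble mask in x5
	 * means c was found, and the code at 3f/4f finishes the copy.
	 */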
	.p2align 4
0:
	cmeq	v2.16b, v1.16b, v0.16b	// char found in current chunk?
	shrn	v2.8b, v2.8h, #4
	fmov	x5, d2

	cbnz	x5, 3f

	str	q1, [x0]
	ldr	q1, [x10, #16]		// load next chunk

	cmp	x3, #16			// more than a full chunk left?
	b.lo	2f

	add	x10, x10, #32		// advance pointers
	add	x0, x0, #32

	cmeq	v2.16b, v1.16b, v0.16b	// char found in second chunk?
	shrn	v2.8b, v2.8h, #4
	fmov	x5, d2
	cbnz	x5, 4f			// process chunk if match

	str	q1, [x0, #-16]
	ldr	q1, [x10]		// load next chunk

	subs	x3, x3, #32
	b.hs	0b

1:
	sub	x10, x10, #16		// undo second advancement
	add	x3, x3, #16
	sub	x0, x0, #16

	/* 1--16 bytes left in the buffer but string has not ended yet */
2:
	cmeq	v2.16b, v1.16b, v0.16b	// char found in final chunk?
	shrn	v2.8b, v2.8h, #4
	fmov	x4, d2

	lsl	x5, x3, #2		// shift 0xf to the limits position
	lsl	x5, x6, x5
	orr	x8, x4, x5		// insert match in mask at limit
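
	/*
	 * The orr above plants a phony match nibble at the buffer
	 * limit, so the tzcnt below stops at whichever comes first: the
	 * real terminator or the end of the buffer.  The tst/csel at
	 * the end tells the two cases apart.
	 */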

	rbit	x8, x8			// simulate x86 tzcnt
	clz	x7, x8			// index of match
	lsr	x8, x7, #2

	lsl	x5, x6, x7		// simulate x86 bt with shifted 0xf

	add	x8, x8, #1
	add	x0, x0, x8

	ldr	q1, [x10, x8]		// load tail
	str	q1, [x0]		// store tail

	add	x0, x0, #16

	tst	x4, x5			// terminator encountered inside buffer?
	csel	x0, x0, xzr, ne		// if yes, return pointer, else NULL
	ret

4:
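	/*
	 * c was found inside the unrolled loop.  Label 4 undoes the
	 * extra 16-byte advance of the loop's second half; label 3 then
	 * converts the nibble mask into a byte count and finishes with
	 * one overlapping 16-byte copy.
	 */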
	sub	x10, x10, #16		// undo second advancement
	sub	x0, x0, #16		// undo second advancement

3:
	rbit	x8, x5
	clz	x8, x8			// index of match
	lsr	x3, x8, #2

	add	x0, x0, x3		// restore dst pointer
	add	x10, x10, x3
	ldr	q1, [x10, #-15]
	str	q1, [x0, #-15]
	add	x0, x0, #1
	ret

.Lrunt:
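	/*
	 * The whole buffer ends within the first 32 bytes of the
	 * aligned block.  A phony match is inserted at the buffer limit
	 * so a single tzcnt finds whichever of the terminator or the
	 * limit comes first; the copy is then dispatched by length.
	 */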
	add	x13, x11, x3		// buffer end within the aligned block

	mov	x7, x5			// keep a copy of original match mask

	lsl	x4, x12, #2		// shift 0xf to the limits position
	lsl	x4, x6, x4

	cmp	x13, #16		// don't induce match if limit >= 16
	csel	x4, x4, xzr, lo
	orr	x5, x5, x4		// insert match in mask at limit

	ands	x8, x8, x5		// match or limit in first chunk?
	b.ne	0f			// if so, process first chunk

	ldr	q4,	[x10, #16]	// load second string chunk
	cmeq	v1.16b, v4.16b, v0.16b	// char found in second chunk?

	/* process second chunk */
	shrn	v1.8b, v1.8h, #4
	fmov	x8, d1
	mov	x7, x8			// keep a copy of the real match mask

	lsl	x4, x12, #2
	lsl	x4, x6, x4
	orr	x8, x8, x4		// induce match in upper bytes of mask

	rbit	x8, x8
	clz	x4, x8			// index of match
	lsr	x8, x4, #2
	add	x8, x8, #16		// no match in first chunk
	b	1f

0:
	rbit	x8, x8
	clz	x4, x8			// index of match
	lsr	x8, x4, #2
1:
	add	x0, x0, x8		// return value if terminator not found
	sub	x0, x0, x11
	add	x0, x0, #1

	/* check if we encountered a match or the limit first */
	lsl	x5, x6, x4
	ands	x7, x7, x5		// was the terminator present?
	csel	x0, xzr, x0, eq		// return value based on what we matched

	sub	x8, x8, x11
	add	x4, x9, x8		// dst + cnt
	add	x5, x1, x8		// src + cnt

	/* Copy 17-32 bytes */
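	/*
	 * The sized copies below use overlapping pairs: one load/store
	 * from the first byte and one ending exactly at the last, so
	 * every length within a bracket is covered without a byte loop.
	 */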
.L1732:
	cmp	x8, #16
	b.lo	.L0816
	add	x5, x5, #1		// point past last byte; ldp offsets
	add	x4, x4, #1		// must be multiples of 8
	ldp	x16, x17, [x1]
	ldp	x12, x13, [x5, #-16]
	stp	x16, x17, [x9]
	stp	x12, x13, [x4, #-16]
	ret

	/* Copy 8-16 bytes */
.L0816:
	tbz	x8, #3, .L0407
	ldr	x16, [x1]
	ldr	x17, [x5, #-7]
	str	x16, [x9]
	str	x17, [x4, #-7]
	ret

	/* Copy 4-7 bytes */
	.p2align 4
.L0407:
	cmp	x8, #3
	b.lo	.L0103
	ldr	w16, [x1]
	ldr	w18, [x5, #-3]
	str	w16, [x9]
	str	w18, [x4, #-3]
	ret

	/* Copy 1-3 bytes */
	.p2align 4
.L0103:
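	/*
	 * Load the first, middle (cnt/2), and last byte; for counts of
	 * 1--3 these three positions cover the whole copy, overlapping
	 * for the shorter cases.
	 */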
	lsr	x14, x8, #1
	ldrb	w16, [x1]
	ldrb	w15, [x5]
	ldrb	w18, [x1, x14]
	strb	w16, [x9]
	strb	w18, [x9, x14]
	strb	w15, [x4]
	ret

.L0:
	mov	x0, #0			// len was zero: return NULL
	ret

END(__memccpy)