/*
 * Copyright (c) 2023, 2024 The FreeBSD Foundation
 *
 * This software was developed by Robert Clausecker <fuz@FreeBSD.org>
 * under sponsorship from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <machine/asm.h>

#include "amd64_archlevel.h"

#define ALIGN_TEXT	.p2align 4, 0x90

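/*
 * memccpy is exported as a weak alias for __memccpy so the symbol can
 * be interposed.  The ARCHFUNCS/ARCHFUNC/ENDARCHFUNCS macros (see
 * amd64_archlevel.h) set up a resolver that selects, at load time, the
 * most advanced of the implementations below supported by the CPU's
 * amd64 architecture level.
 */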
	.weak memccpy
	.set memccpy, __memccpy
ARCHFUNCS(__memccpy)
	ARCHFUNC(__memccpy, scalar)
	ARCHFUNC(__memccpy, baseline)
ENDARCHFUNCS(__memccpy)

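/*
 * The scalar implementation composes memchr() and memcpy(): locate the
 * terminator (if present), then copy everything up to and including it.
 * Roughly equivalent C, as a sketch of the logic (not the exact code):
 *
 *	void *
 *	memccpy(void *dst, const void *src, int c, size_t len)
 *	{
 *		const char *match = memchr(src, c, len);
 *		size_t n = match == NULL ?
 *		    len : (size_t)(match - (const char *)src) + 1;
 *
 *		memcpy(dst, src, n);
 *		return (match == NULL ? NULL : (char *)dst + n);
 *	}
 */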
ARCHENTRY(__memccpy, scalar)
	push	%rbp			# establish stack frame
	mov	%rsp, %rbp
	push	%rax			# dummy push for alignment
	push	%rbx
	push	%rdi
	push	%rsi

	mov	%rsi, %rdi
	mov	%edx, %esi
	mov	%rcx, %rdx
	mov	%rcx, %rbx		# stash len across the memchr call
	call	CNAME(__memchr)		# ptr = memchr(src, c, len)

	pop	%rsi
	pop	%rdi
	lea	1(%rax), %rdx
	sub	%rsi, %rdx		# size = ptr - src + 1
	mov	%rbx, %rcx
	lea	(%rdi, %rdx, 1), %rbx	# res = dest + size
	test	%rax, %rax		# if (ptr == NULL)
	cmovz	%rcx, %rdx		# size = len
	cmovz	%rax, %rbx		# res = NULL
	call	CNAME(memcpy)

	mov	%rbx, %rax		# return (res)
	pop	%rbx
	leave
	ret
ARCHEND(__memccpy, scalar)

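/*
 * The baseline implementation uses SSE2 and processes the string in
 * aligned 16-byte chunks: the terminator c is broadcast to all sixteen
 * bytes of a vector register, compared against each chunk with PCMPEQB,
 * and the resulting byte mask is extracted with PMOVMSKB and scanned
 * with TZCNT.  (TZCNT executes as BSF on pre-BMI1 CPUs; the two agree
 * here as the operand is always nonzero when scanned.)  Buffers of at
 * most 32 bytes take a separate code path (.Lrunt) that skips the main
 * loop.  In the code below, RSI holds the source rounded down to a
 * 16 byte boundary, R9 the original source, RCX the source's
 * misalignment, RDI the destination, and RDX the buffer length less one.
 */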
ARCHENTRY(__memccpy, baseline)
	sub		$1, %rcx		# RCX refers to last character in buffer
	jb		.L0			# go to special code path if len was 0

	movd		%edx, %xmm4
	mov		%rcx, %rdx
	punpcklbw	%xmm4, %xmm4		# c -> cc
	mov		%esi, %ecx
	punpcklwd	%xmm4, %xmm4		# cc -> cccc
	mov		%rsi, %r9		# stash a copy of the source pointer for later
	pshufd		$0, %xmm4, %xmm4	# cccc -> cccccccccccccccc
	and		$~0xf, %rsi
	movdqa		%xmm4, %xmm1
	pcmpeqb		(%rsi), %xmm1		# c found in head?
	and		$0xf, %ecx
	mov		$-1, %eax		# all-ones, shifted below into a mask of string bytes
	pmovmskb	%xmm1, %r8d		# match mask for head chunk
	lea		-32(%rcx), %r11
	shl		%cl, %eax		# mask of bytes in the string
	add		%rdx, %r11		# distance from alignment boundary - 32
	jnc		.Lrunt			# jump if buffer length is 32 or less

	and		%r8d, %eax
	jz		0f			# match (or induced match) found?

	/* match in first chunk */
	tzcnt		%eax, %edx		# where is c?
	sub		%ecx, %edx		# ... from the beginning of the string?
	lea		1(%rdi, %rdx, 1), %rax	# return value
	jmp		.L0116

0:	movdqa		16(%rsi), %xmm3		# load second string chunk
	movdqu		(%r9), %xmm2		# load unaligned string head
	movdqa		%xmm4, %xmm1
	pcmpeqb		%xmm3, %xmm1		# c found in second chunk?

	/* process second chunk */
	pmovmskb	%xmm1, %eax
	test		%eax, %eax
	jz		0f

	/* match in second chunk */
	tzcnt		%eax, %edx		# where is c?
	sub		$16, %ecx
	sub		%ecx, %edx		# adjust for alignment offset
	lea		1(%rdi, %rdx, 1), %rax	# return value
	jmp		.L0132

	/* c not found in second chunk: prepare for main loop */
0:	movdqa		32(%rsi), %xmm0		# load next string chunk
	movdqa		%xmm4, %xmm1
	movdqu		%xmm2, (%rdi)		# deposit head into buffer
	sub		%rcx, %rdi		# adjust RDI to correspond to RSI
	mov		%r11, %rdx
	movdqu		%xmm3, 16(%rdi)		# deposit second chunk
	sub		%rsi, %rdi		# express RDI as distance from RSI
	add		$32, %rsi		# advance RSI past first two chunks
	sub		$16, %rdx		# enough left for another round?
	jb		1f

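	/*
	 * Loop invariant: XMM0 holds the 16-byte chunk at (%rsi), XMM1
	 * the broadcast terminator, and RDI the distance from source to
	 * destination, so only RSI needs advancing.  Each chunk is
	 * stored only once it is known not to contain the terminator;
	 * the chunk that does (or in which the buffer ends) is left to
	 * the tail code below.
	 */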
	/* main loop unrolled twice */
	ALIGN_TEXT
0:	pcmpeqb		%xmm0, %xmm1		# c encountered?
	pmovmskb	%xmm1, %eax
	test		%eax, %eax
	jnz		3f

	movdqu		%xmm0, (%rsi, %rdi)
	movdqa		16(%rsi), %xmm0		# load next string chunk
	movdqa		%xmm4, %xmm1
	cmp		$16, %rdx		# more than a full chunk left?
	jb		2f

	add		$32, %rsi		# advance pointers to next chunk
	pcmpeqb		%xmm0, %xmm1		# c encountered?
	pmovmskb	%xmm1, %eax
	test		%eax, %eax
	jnz		4f

	movdqu		%xmm0, -16(%rsi, %rdi)
	movdqa		(%rsi), %xmm0		# load next string chunk
	movdqa		%xmm4, %xmm1
	sub		$32, %rdx
	jae		0b

1:	sub		$16, %rsi		# undo second advancement
	add		$16, %edx		# undo corresponding decrement of the count

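	/*
	 * The remainder is finished with one unaligned 16-byte store
	 * whose last byte coincides with the last byte to be copied;
	 * it partially overlaps data already stored by the loop above,
	 * rewriting those bytes with the same values.
	 */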
	/* 1--16 bytes left in the buffer but string has not ended yet */
2:	pcmpeqb		%xmm1, %xmm0		# c encountered?
	pmovmskb	%xmm0, %r8d
	mov		%r8d, %ecx		# save a copy of the match mask
	bts		%edx, %r8d		# treat end of buffer as end of string
	tzcnt		%r8d, %r8d		# find tail length
	add		%rsi, %rdi		# restore RDI
	movdqu		1(%rsi, %r8, 1), %xmm0	# load string tail
	movdqu		%xmm0, 1(%rdi, %r8, 1)	# store string tail
	lea		17(%rdi, %r8, 1), %rsi	# return value if terminator encountered
	xor		%eax, %eax		# return value if no terminator encountered
	bt		%r8d, %ecx		# terminator encountered inside buffer?
	cmovc		%rsi, %rax		# if yes, return pointer, else NULL
	ret

4:	sub		$16, %rsi		# undo second advancement

	/* terminator found and buffer has not ended yet */
3:	tzcnt		%eax, %eax		# find length of string tail
	movdqu		-15(%rsi, %rax, 1), %xmm0 # load string tail (incl. c)
	add		%rsi, %rdi		# restore destination pointer
	movdqu		%xmm0, -15(%rdi, %rax, 1) # store string tail (incl. c)
	lea		1(%rdi, %rax, 1), %rax	# compute return value
	ret

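	/*
	 * For short buffers, a match is induced at the last buffer byte
	 * (BTS into the match mask), so a single TZCNT yields the copy
	 * length whether or not c was found; a saved copy of the
	 * unmodified mask then tells the two cases apart when computing
	 * the return value.
	 */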
	/* buffer is 1--32 bytes in size */
	ALIGN_TEXT
.Lrunt:	add		$32, %r11d		# undo earlier decrement
	mov		%r8d, %r10d		# keep a copy of the original match mask
	bts		%r11d, %r8d		# induce match at buffer end
	and		%ax, %r8w		# is there a match in the first 16 bytes?
	jnz		0f			# if yes, skip looking at second chunk

	pcmpeqb		16(%rsi), %xmm4		# check for match in second chunk
	pmovmskb	%xmm4, %r8d
	shl		$16, %r8d		# place second chunk matches in bits 16--31
	mov		%r8d, %r10d		# keep a copy of the original match mask
	bts		%r11d, %r8d		# induce a match at buffer end

0:	xor		%eax, %eax		# return value if terminator not found
	tzcnt		%r8d, %edx		# find string/buffer length from alignment boundary
	lea		1(%rdi, %rdx, 1), %r8	# return value if terminator found, plus rcx
	sub		%rcx, %r8		# subtract the alignment offset again
	bt		%edx, %r10d		# was the terminator present?
	cmovc		%r8, %rax		# if yes, return pointer, else NULL
	sub		%ecx, %edx		# find actual string/buffer length

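	/*
	 * The fixed-size copy routines below all use the same trick: to
	 * copy n bytes with 2^k <= n <= 2^(k+1), load the first and the
	 * last 2^k bytes of the source and store both, letting the two
	 * stores overlap in the middle.  Every length in the range is
	 * covered with two loads and two stores and no further length
	 * checks.  A sketch of the idea in C for 4 < n <= 8:
	 *
	 *	uint32_t head, tail;
	 *	memcpy(&head, src, 4);
	 *	memcpy(&tail, src + n - 4, 4);
	 *	memcpy(dst, &head, 4);
	 *	memcpy(dst + n - 4, &tail, 4);
	 */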
	ALIGN_TEXT
.L0132:	cmp		$16, %rdx		# at least 17 bytes to copy?
	jb		.L0116

	/* copy 17--32 bytes */
	movdqu		(%r9), %xmm0		# load first 16 bytes
	movdqu		-15(%r9, %rdx, 1), %xmm1 # load last 16 bytes
	movdqu		%xmm0, (%rdi)
	movdqu		%xmm1, -15(%rdi, %rdx, 1)
	ret

	/* process strings of 1--16 bytes (rdx: bytes to copy less one, rax: return value) */
	ALIGN_TEXT
.L0116:	cmp		$8, %rdx		# at least 9 bytes to copy?
	jae		.L0916

	cmp		$4, %rdx		# at least 5 bytes to copy?
	jae		.L0508

	cmp		$2, %rdx		# at least 3 bytes to copy?
	jae		.L0304

	/* copy one or two bytes */
	movzbl		(%r9), %ecx		# load first byte from src
	movzbl		(%r9, %rdx, 1), %esi	# load last byte from src
	mov		%cl, (%rdi)		# deposit into destination
	mov		%sil, (%rdi, %rdx, 1)
	ret

	/* copy three or four bytes */
.L0304:	movzwl		(%r9), %ecx		# load first two bytes
	movzwl		-1(%r9, %rdx, 1), %esi	# load last two bytes
	mov		%cx, (%rdi)		# deposit into destination
	mov		%si, -1(%rdi, %rdx, 1)
	ret

	/* copy five to eight bytes */
.L0508:	mov		(%r9), %ecx		# load first four bytes
	mov		-3(%r9, %rdx, 1), %esi	# load last four bytes
	mov		%ecx, (%rdi)		# deposit into destination
	mov		%esi, -3(%rdi, %rdx, 1)
	ret

	/* copy nine to sixteen bytes */
.L0916:	mov		(%r9), %rcx		# load first eight bytes
	mov		-7(%r9, %rdx, 1), %rsi	# load last eight bytes
	mov		%rcx, (%rdi)		# deposit into destination
	mov		%rsi, -7(%rdi, %rdx, 1)
	ret

	/* length zero destination: return null pointer */
.L0:	xor		%eax, %eax
	ret
ARCHEND(__memccpy, baseline)

	.section .note.GNU-stack,"",%progbits