path: root/lib/libc/amd64/string/stpcpy.S
/*-
 * Copyright (c) 2023, The FreeBSD Foundation
 *
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Portions of this software were developed by Robert Clausecker
 * <fuz@FreeBSD.org> under sponsorship from the FreeBSD Foundation.
 *
 * Adapted from NetBSD's common/lib/libc/arch/x86_64/string/strcpy.S,
 * written by J.T. Conklin <jtc@acorntoolworks.com> and adapted by
 * Guillaume Morin <guillaume@morinfr.org> to implement stpcpy; the
 * original was dedicated to the public domain.
 */

#include <machine/asm.h>

#include "amd64_archlevel.h"

#define ALIGN_TEXT	.p2align 4, 0x90

	.weak stpcpy
	.set stpcpy, __stpcpy
ARCHFUNCS(__stpcpy)
	ARCHFUNC(__stpcpy, scalar)
	ARCHFUNC(__stpcpy, baseline)
ENDARCHFUNCS(__stpcpy)
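
/*
 * The ARCHFUNCS machinery from amd64_archlevel.h picks one of the
 * implementations below at load time based on the architecture level
 * the CPU supports (overridable through the ARCHLEVEL environment
 * variable, see simd(7)).
 */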

/*
 * This stpcpy implementation copies a byte at a time until the
 * source pointer is aligned to a word boundary; it then copies by
 * words until it finds a word containing a zero byte, and finally
 * copies by bytes until the end of the string is reached.
 *
 * While this may result in unaligned stores if the source and
 * destination pointers are unaligned with respect to each other,
 * it is still faster than either byte copies or the overhead of
 * an implementation suitable for machines with strict alignment
 * requirements.
 */
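
/*
 * For reference, the scalar variant corresponds roughly to the
 * following C sketch (illustrative only: the function name is made
 * up and the real entry point is the assembly below).
 *
 *	#include <string.h>
 *
 *	char *
 *	stpcpy_scalar_sketch(char *restrict dst, const char *restrict src)
 *	{
 *		unsigned long	v;
 *		int		i;
 *
 *		// copy bytewise until src is aligned to a word boundary
 *		while ((unsigned long)src % 8 != 0)
 *			if ((*dst++ = *src++) == '\0')
 *				return (dst - 1);
 *
 *		for (;;) {
 *			memcpy(&v, src, 8);	// load one word
 *			if ((v - 0x0101010101010101UL) &
 *			    0x8080808080808080UL) {
 *				// possible NUL byte: check bytewise
 *				for (i = 0; i < 8; i++)
 *					if ((*dst++ = *src++) == '\0')
 *						return (dst - 1);
 *			} else {
 *				memcpy(dst, &v, 8);	// store whole word
 *				src += 8;
 *				dst += 8;
 *			}
 *		}
 *	}
 */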

ARCHENTRY(__stpcpy, scalar)
	movabsq $0x0101010101010101,%r8
	movabsq $0x8080808080808080,%r9
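
	/*
	 * These constants implement the word-at-a-time NUL check used
	 * below: (v - 0x0101...) & 0x8080... is nonzero whenever the
	 * word v contains a zero byte, with occasional false positives.
	 */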

	/*
	 * Align source to a word boundary.
	 * Consider unrolling loop?
	 */
.Lalign:
	testb	$7,%sil
	je	.Lword_aligned
	movb	(%rsi),%dl
	incq	%rsi
	movb	%dl,(%rdi)
	incq	%rdi
	testb	%dl,%dl
	jne	.Lalign
	movq	%rdi,%rax
	dec	%rax
	ret

	ALIGN_TEXT
.Lloop:
	movq	%rdx,(%rdi)
	addq	$8,%rdi
.Lword_aligned:
	movq	(%rsi),%rdx
	movq	%rdx,%rcx
	addq	$8,%rsi
	subq	%r8,%rcx
	testq	%r9,%rcx
	je	.Lloop

	/*
	 * In rare cases, the above loop may exit prematurely: the test
	 * also fires on bytes whose high bit is still set after the
	 * subtraction (e.g. 0x81 becomes 0x80), not just on NUL bytes.
	 * We must return to the loop if none of the bytes in the word
	 * equal 0.
	 */

	movb	%dl,(%rdi)
	testb	%dl,%dl		/* 1st byte == 0? */
	je	.Ldone
	incq	%rdi

	shrq	$8,%rdx
	movb	%dl,(%rdi)
	testb	%dl,%dl		/* 2nd byte == 0? */
	je	.Ldone
	incq	%rdi

	shrq	$8,%rdx
	movb	%dl,(%rdi)
	testb	%dl,%dl		/* 3rd byte == 0? */
	je	.Ldone
	incq	%rdi

	shrq	$8,%rdx
	movb	%dl,(%rdi)
	testb	%dl,%dl		/* 4th byte == 0? */
	je	.Ldone
	incq	%rdi

	shrq	$8,%rdx
	movb	%dl,(%rdi)
	testb	%dl,%dl		/* 5th byte == 0? */
	je	.Ldone
	incq	%rdi

	shrq	$8,%rdx
	movb	%dl,(%rdi)
	testb	%dl,%dl		/* 6th byte == 0? */
	je	.Ldone
	incq	%rdi

	shrq	$8,%rdx
	movb	%dl,(%rdi)
	testb	%dl,%dl		/* 7th byte == 0? */
	je	.Ldone
	incq	%rdi

	shrq	$8,%rdx
	movb	%dl,(%rdi)
	incq	%rdi
	testb	%dl,%dl		/* 8th byte == 0? */
	jne	.Lword_aligned
	decq	%rdi

.Ldone:
	movq	%rdi,%rax
	ret
ARCHEND(__stpcpy, scalar)
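
/*
 * The baseline implementation processes the source in aligned blocks
 * of 16 bytes; as aligned loads cannot cross a page boundary, it is
 * safe to read up to 15 bytes of junk before the start of the string.
 * NUL bytes are found by comparing each block with zero (pcmpeqb) and
 * extracting a bit mask of the matches (pmovmskb); matches within the
 * junk bytes are shifted out of the mask.  A hedged intrinsics sketch
 * of this header check (the function name is made up):
 *
 *	#include <strings.h>
 *	#include <emmintrin.h>
 *
 *	// index of the NUL byte within the string if one occurs in
 *	// the aligned 16 byte block holding its head, else -1
 *	static int
 *	head_nul_index(const char *src)
 *	{
 *		unsigned off, mask;
 *		__m128i head;
 *
 *		off = (unsigned long)src % 16;	// misalignment in bytes
 *		head = _mm_load_si128((const __m128i *)(src - off));
 *		mask = _mm_movemask_epi8(
 *		    _mm_cmpeq_epi8(head, _mm_setzero_si128()));
 *		mask >>= off;		// drop matches before the string
 *		return (mask != 0 ? ffs(mask) - 1 : -1);
 *	}
 */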

ARCHENTRY(__stpcpy, baseline)
	mov	%esi, %ecx
	mov	%rdi, %rdx
	sub	%rsi, %rdi		# express destination as distance to source
	and	$~0xf, %rsi		# align source to 16 bytes
	movdqa	(%rsi), %xmm0		# head of string with junk before
	pxor	%xmm1, %xmm1
	and	$0xf, %ecx		# misalignment in bytes
	pcmpeqb	%xmm1, %xmm0		# NUL byte present?
	pmovmskb %xmm0, %eax
	shr	%cl, %eax		# clear out matches in junk bytes
	bsf	%eax, %eax		# find match if any
	jnz	.Lrunt

	/* first normal iteration: write head back if it succeeds */
	movdqa	16(%rsi), %xmm0		# 16 bytes of current iteration
	movdqu	(%rsi, %rcx, 1), %xmm2	# first 16 bytes of the string
	pcmpeqb	%xmm0, %xmm1		# NUL byte present?
	pmovmskb %xmm1, %eax
	test	%eax, %eax		# find match if any
	jnz	.Lshorty

	movdqu	%xmm2, (%rdx)		# store beginning of string

	/* main loop, unrolled twice */
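	/*
	 * Each iteration checks one 16 byte block for a NUL byte, but
	 * stores the block from the previous iteration only once it is
	 * known to be NUL-free, so the destination is never written
	 * past the end of the string.
	 */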
	ALIGN_TEXT
0:	movdqa	32(%rsi), %xmm2		# load current iteration
	movdqu	%xmm0, 16(%rsi, %rdi, 1) # write back previous iteration
	pxor	%xmm1, %xmm1
	add	$32, %rsi
	pcmpeqb	%xmm2, %xmm1		# NUL byte present?
	pmovmskb %xmm1, %eax
	test	%eax, %eax
	jnz	1f

	movdqa	16(%rsi), %xmm0		# load current iteration
	movdqu	%xmm2, (%rsi, %rdi, 1)	# write back previous iteration
	pxor	%xmm1, %xmm1
	pcmpeqb	%xmm0, %xmm1		# NUL byte present?
	pmovmskb %xmm1, %eax
	test	%eax, %eax
	jz	0b

	/* end of string after main loop has iterated */
	add	$16, %rsi		# advance rsi to second unrolled half
1:	tzcnt	%eax, %eax		# find location of match
					# (behaves as bsf on pre-x86-64-v3 CPUs)
	add	%rsi, %rax		# point to NUL byte
	movdqu	-15(%rax), %xmm0	# last 16 bytes of string
	movdqu	%xmm0, -15(%rax, %rdi, 1) # copied to destination
	add	%rdi, %rax		# point to destination's NUL byte
	ret

	/* NUL encountered in second iteration */
.Lshorty:
	tzcnt	%eax, %eax
	add	$16, %eax		# account for length of first iteration
	sub	%ecx, %eax		# but not the parts before the string

	/* NUL encountered in first iteration */
.Lrunt:	lea	1(%rax), %edi		# string length including NUL byte
	add	%rcx, %rsi		# point to beginning of string
	add	%rdx, %rax		# point to NUL byte
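
	/*
	 * The cases below copy a short string of %edi bytes (NUL byte
	 * included) using a pair of possibly overlapping loads and
	 * stores: one chunk anchored at the start of the string and
	 * one ending exactly at its NUL byte.  E.g. an 11 byte string
	 * is copied as bytes 0--7 plus bytes 3--10.
	 */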

	/* transfer 16--32 bytes */
.L1632:	cmp	$16, %edi
	jb	.L0815

	movdqu	-16(%rsi, %rdi, 1), %xmm0 # load last 16 bytes
	movdqu	%xmm2, (%rdx)		# store first 16 bytes
	movdqu	%xmm0, -15(%rax)	# store last 16 bytes
	ret

	/* transfer 8--15 bytes */
.L0815:	cmp	$8, %edi
	jb	.L0407

	mov	(%rsi), %rcx		# load first 8 bytes
	mov	-8(%rsi, %rdi, 1), %rdi	# load last 8 bytes
	mov	%rcx, (%rdx)		# store to dst
	mov	%rdi, -7(%rax)		# ditto
	ret

	/* transfer 4--7 bytes */
.L0407:	cmp	$4, %edi
	jb	.L0203

	mov	(%rsi), %ecx
	mov	-4(%rsi, %rdi, 1), %edi
	mov	%ecx, (%rdx)
	mov	%edi, -3(%rax)
	ret

	/* transfer 2--3 bytes */
.L0203:	cmp	$2, %edi
	jb	.L0101

	movzwl	(%rsi), %ecx
	mov	%cx, (%rdx)		# store first two bytes

	/* transfer the last byte (always the terminating NUL) */
.L0101:	movb	$0, (%rax)		# store terminating NUL byte
	ret
ARCHEND(__stpcpy, baseline)

	.section .note.GNU-stack,"",%progbits