1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
|
/*-
* Copyright (c) 2023, The FreeBSD Foundation
*
* SPDX-License-Expression: BSD-2-Clause
*
* Portions of this software were developed by Robert Clausecker
* <fuz@FreeBSD.org> under sponsorship from the FreeBSD Foundation.
*
* Adapted from NetBSD's common/lib/libc/arch/x86_64/string/strcpy.S
* written by J.T. Conklin <jtc@acorntoolworks.com> and
* adapted by Guillaume Morin <guillaume@morinfr.org> to implement stpcpy
* that was originally dedicated to the public domain
*/
#include <machine/asm.h>
#include "amd64_archlevel.h"
/* Align the next instruction to 16 bytes (2^4), padding with 0x90 (x86 NOP). */
#define ALIGN_TEXT .p2align 4, 0x90
/* Export stpcpy as a weak alias for __stpcpy. */
.weak stpcpy
.set stpcpy, __stpcpy
/*
 * List the variants of __stpcpy implemented in this file: scalar and
 * baseline.
 * NOTE(review): ARCHFUNCS/ARCHFUNC/ENDARCHFUNCS come from
 * amd64_archlevel.h, which is not visible here; presumably they emit the
 * table used to pick one variant at run time -- confirm against the header.
 */
ARCHFUNCS(__stpcpy)
ARCHFUNC(__stpcpy, scalar)
ARCHFUNC(__stpcpy, baseline)
ENDARCHFUNCS(__stpcpy)
/*
 * char *__stpcpy(char *dst, const char *src) -- scalar variant
 *
 * ABI:      System V AMD64
 * In:       %rdi = dst, %rsi = src
 * Out:      %rax = address of the NUL terminator stored in dst
 * Clobbers: %rcx, %rdx, %rsi, %rdi, %r8, %r9, flags
 *
 * The copy runs in three phases:
 *   1. single bytes until the source pointer is a multiple of 8,
 *   2. whole 8-byte words for as long as a cheap test proves that the
 *      word holds no NUL byte,
 *   3. single bytes through the word that failed the test.
 *
 * Only the loads are aligned.  When dst and src are misaligned with
 * respect to each other the word stores are unaligned; that is still
 * cheaper than copying bytewise throughout or than an implementation
 * written for strict-alignment machines.
 */
ARCHENTRY(__stpcpy, scalar)
	movabsq	$0x0101010101010101,%r8		/* 0x01 in every byte lane */
	movabsq	$0x8080808080808080,%r9		/* 0x80 in every byte lane */

	/*
	 * Phase 1: copy bytes one at a time until %rsi is 8-byte aligned.
	 * (Unrolling this loop might be worthwhile.)
	 */
.Lhead_byte:
	testb	$7,%sil
	je	.Lnext_word
	movb	(%rsi),%dl
	movb	%dl,(%rdi)
	incq	%rsi
	incq	%rdi
	testb	%dl,%dl
	jne	.Lhead_byte
	leaq	-1(%rdi),%rax			/* step back onto the NUL just stored */
	ret

	/*
	 * Phase 2: word loop.  Entry is at .Lnext_word; .Lstore_word
	 * writes out the word that the previous pass found to be clean.
	 */
	ALIGN_TEXT
.Lstore_word:
	movq	%rdx,(%rdi)
	addq	$8,%rdi
.Lnext_word:
	movq	(%rsi),%rdx
	addq	$8,%rsi
	movq	%rdx,%rcx
	subq	%r8,%rcx			/* (word - 0x01..01) & 0x80..80 is */
	testq	%r9,%rcx			/* zero only if no byte is 0x00 */
	je	.Lstore_word

	/*
	 * Phase 3: the test above never misses a NUL, but it also trips
	 * on bytes >= 0x81.  Walk the word byte by byte, storing each
	 * byte before examining it, so that a false alarm simply copies
	 * all eight bytes and resumes the word loop.
	 */
	.rept	7				/* bytes 1 through 7 of the word */
	movb	%dl,(%rdi)
	testb	%dl,%dl
	je	.Lfound_nul			/* %rdi already points at the NUL */
	incq	%rdi
	shrq	$8,%rdx				/* next byte into %dl */
	.endr

	movb	%dl,(%rdi)			/* byte 8 of the word */
	incq	%rdi
	testb	%dl,%dl
	jne	.Lnext_word			/* false alarm: back to the word loop */
	decq	%rdi				/* undo the increment past the NUL */

.Lfound_nul:
	movq	%rdi,%rax
	ret
ARCHEND(__stpcpy, scalar)
/*
 * char *__stpcpy(char *dst, const char *src) -- baseline (SSE2) variant
 *
 * In:  %rdi = dst, %rsi = src
 * Out: %rax = address of the NUL terminator stored in dst
 *
 * Register roles after the setup sequence:
 *   %rsi  source pointer rounded down to a 16-byte boundary
 *   %rcx  misalignment of the original source pointer (0..15)
 *   %rdx  original destination pointer
 *   %rdi  dst - src, so that (%rsi, %rdi, 1) addresses the destination
 *         byte corresponding to source address %rsi
 *   %xmm1 zero / NUL comparison mask, %xmm0 and %xmm2 string data
 *
 * The string is scanned with aligned 16-byte loads.  Strings whose NUL
 * lies in the first two aligned chunks (total length including NUL of at
 * most 32 bytes) are copied by the size-classed code at .Lrunt; longer
 * strings go through the main loop.
 */
ARCHENTRY(__stpcpy, baseline)
mov %esi, %ecx
mov %rdi, %rdx
sub %rsi, %rdi # express destination as distance to source
and $~0xf, %rsi # align source to 16 byte
movdqa (%rsi), %xmm0 # head of string with junk before
pxor %xmm1, %xmm1
and $0xf, %ecx # misalignment in bytes
pcmpeqb %xmm1, %xmm0 # NUL byte present?
pmovmskb %xmm0, %eax
shr %cl, %eax # clear out matches in junk bytes
bsf %eax, %eax # find match if any (offset from string start)
jnz .Lrunt # ZF is clear iff the mask was nonzero, i.e. a NUL was found
/* first normal iteration: write head back if it succeeds */
movdqa 16(%rsi), %xmm0 # 16 bytes of current iteration
movdqu (%rsi, %rcx, 1), %xmm2 # first 16 bytes of the string
pcmpeqb %xmm0, %xmm1 # NUL byte present? (xmm1 is still zero here)
pmovmskb %xmm1, %eax
test %eax, %eax # find match if any
jnz .Lshorty
movdqu %xmm2, (%rdx) # store beginning of string
/*
 * main loop, unrolled twice
 *
 * Invariant at label 0: %xmm0 holds the chunk at 16(%rsi), which is
 * known to be free of NUL bytes but has not been stored yet.
 */
ALIGN_TEXT
0: movdqa 32(%rsi), %xmm2 # load current iteration
movdqu %xmm0, 16(%rsi, %rdi, 1) # write back previous iteration
pxor %xmm1, %xmm1
add $32, %rsi
pcmpeqb %xmm2, %xmm1 # NUL byte present?
pmovmskb %xmm1, %eax
test %eax, %eax
jnz 1f
movdqa 16(%rsi), %xmm0 # load current iteration
movdqu %xmm2, (%rsi, %rdi, 1) # write back previous iteration
pxor %xmm1, %xmm1
pcmpeqb %xmm0, %xmm1 # NUL byte present?
pmovmskb %xmm1, %eax
test %eax, %eax
jz 0b
/* end of string after main loop has iterated */
add $16, %rsi # advance rsi to second unrolled half
/*
 * At label 1, %rsi points to the aligned chunk containing the NUL and
 * everything before that chunk has been stored.  The remainder is
 * copied as the 16 bytes ending at the NUL; this may overlap bytes
 * that were already stored, which rewrites them with the same data.
 */
1: tzcnt %eax, %eax # find location of match
/* (behaves as bsf on pre-x86-64-v3 CPUs) */
add %rsi, %rax # point to NUL byte
movdqu -15(%rax), %xmm0 # last 16 bytes of string
movdqu %xmm0, -15(%rax, %rdi, 1) # copied to destination
add %rdi, %rax # point to destination's NUL byte
ret
/* NUL encountered in second iteration */
.Lshorty:
tzcnt %eax, %eax
add $16, %eax # account for length of first iteration
sub %ecx, %eax # but not the parts before the string
/*
 * NUL encountered in first iteration
 *
 * Here %eax is the offset of the NUL from the start of the string.
 * Each size class below copies with two possibly overlapping
 * accesses: one anchored at the start and one ending at the NUL.
 */
.Lrunt: lea 1(%rax), %edi # string length including NUL byte
add %rcx, %rsi # point to beginning of string
add %rdx, %rax # point to NUL byte
/*
 * transfer 16--32 bytes
 *
 * NOTE: when arriving straight from the first iteration, %xmm2 has
 * not been loaded.  That path gets here only with a length of exactly
 * 16, in which case the store of the last 16 bytes covers the same
 * 16 destination bytes and replaces whatever %xmm2 held.
 */
.L1632: cmp $16, %edi
jb .L0815
movdqu -16(%rsi, %rdi, 1), %xmm0 # load last 16 bytes
movdqu %xmm2, (%rdx) # store first 16 bytes
movdqu %xmm0, -15(%rax) # store last 16 bytes
ret
/* transfer 8--15 bytes */
.L0815: cmp $8, %edi
jb .L0407
mov (%rsi), %rcx # load first 8 bytes
mov -8(%rsi, %rdi, 1), %rdi # load last 8 bytes (overwrites the length)
mov %rcx, (%rdx) # store to dst
mov %rdi, -7(%rax) # ditto
ret
/* transfer 4--7 bytes */
.L0407: cmp $4, %edi
jb .L0203
mov (%rsi), %ecx # load first 4 bytes
mov -4(%rsi, %rdi, 1), %edi # load last 4 bytes (overwrites the length)
mov %ecx, (%rdx) # store to dst
mov %edi, -3(%rax) # ditto
ret
/* transfer 2--3 bytes */
.L0203: cmp $2, %edi
jb .L0101
movzwl (%rsi), %ecx
mov %cx, (%rdx) # store first two bytes
/* transfer 0 bytes (last byte is always NUL) */
.L0101: movb $0, (%rax) # store terminating NUL byte
ret
ARCHEND(__stpcpy, baseline)
/* Empty .note.GNU-stack section: tells the linker this object does not need an executable stack. */
.section .note.GNU-stack,"",%progbits
|