1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
|
/*-
* Copyright (c) 2023, The FreeBSD Foundation
*
* SPDX-License-Identifier: BSD-2-Clause
*
* Portions of this software were developed by Robert Clausecker
* <fuz@FreeBSD.org> under sponsorship from the FreeBSD Foundation.
*
* Adapted from NetBSD's common/lib/libc/arch/x86_64/string/strcat.S
* written by J.T. Conklin <jtc@acorntoolworks.com>
* that was originally dedicated to the public domain
*/
#include <machine/asm.h>
#if 0
RCSID("$NetBSD: strcat.S,v 1.4 2004/07/26 18:51:21 drochner Exp $")
#endif
#include "amd64_archlevel.h"
/*
 * Implementation list for strcat: a scalar variant and a baseline variant,
 * both defined later in this file.
 * NOTE(review): ARCHFUNCS/ARCHFUNC/ENDARCHFUNCS come from
 * amd64_archlevel.h, which is not visible here; presumably they emit the
 * table from which one variant is selected at run time -- confirm there,
 * including whether the order of the entries matters.
 */
ARCHFUNCS(strcat)
ARCHFUNC(strcat, scalar)
ARCHFUNC(strcat, baseline)
ENDARCHFUNCS(strcat)
/*
 * char *strcat(char *dest, const char *src) -- scalar variant.
 *
 * In:	%rdi = dest, %rsi = src
 * Out:	%rax = dest (copied from %rdi on entry)
 * Scratch: %rcx, %rdx, %r8, %r9, flags.  Uses no stack and makes no calls.
 *
 * %r8 and %r9 hold 0x01 and 0x80 replicated into every byte.  For a
 * word w, (w - %r8) & %r9 is zero only when no byte of w is zero.  A
 * nonzero result can be a false alarm, so every word that trips the test
 * is re-examined one byte at a time before it is trusted.
 */
ARCHENTRY(strcat, scalar)
	movq	%rdi, %rax		/* return value is the original dest */
	movabsq	$0x0101010101010101, %r8
	movabsq	$0x8080808080808080, %r9

	/*
	 * Phase 1a: walk dest one byte at a time until %rdi is a multiple
	 * of 8, leaving early if the terminator shows up first.
	 */
.Lend_bytewise:
	testb	$7, %dil
	jz	.Lend_wordwise
	cmpb	$0, (%rdi)
	je	.Lappend
	incq	%rdi
	jmp	.Lend_bytewise

	/*
	 * Phase 1b: search for the terminator of dest one aligned word
	 * at a time.  %rdi is advanced before the test, so on leaving the
	 * loop it points just past the suspicious word.
	 */
	.align	4
.Lend_wordwise:
	movq	(%rdi), %rdx
	addq	$8, %rdi
	subq	%r8, %rdx
	testq	%r9, %rdx
	jz	.Lend_wordwise

	/*
	 * The word test fired.  Check bytes 1..7 of that word (at offsets
	 * -8..-2 from %rdi) in order; at the first zero byte, point %rdi at
	 * it and start appending.
	 */
	.irp	off, 8, 7, 6, 5, 4, 3, 2
	cmpb	$0, -\off(%rdi)
	jne	1f
	subq	$\off, %rdi
	jmp	.Lappend
1:
	.endr

	/*
	 * Last byte of the word.  If it is nonzero as well, the word test
	 * was a false alarm and the word loop is resumed.
	 */
	cmpb	$0, -1(%rdi)
	jne	.Lend_wordwise
	subq	$1, %rdi

	/*
	 * Phase 2a: %rdi now points at the terminator of dest.  Copy src
	 * bytewise until %rsi is a multiple of 8; if the terminator of src
	 * gets copied on the way, the job is done.
	 */
.Lappend:
	testb	$7, %sil
	jz	.Lappend_word
	movb	(%rsi), %dl
	incq	%rsi
	movb	%dl, (%rdi)
	incq	%rdi
	testb	%dl, %dl
	jnz	.Lappend
	ret

	/*
	 * Phase 2b: copy whole words.  A word is stored with a single
	 * 8-byte write only once the test has shown it has no zero byte.
	 * %rdx keeps the loaded word, %rcx is used for the test.
	 */
	.align	4
.Lappend_store:
	movq	%rdx, (%rdi)
	addq	$8, %rdi
.Lappend_word:
	movq	(%rsi), %rdx
	movq	%rdx, %rcx
	addq	$8, %rsi
	subq	%r8, %rcx
	testq	%r9, %rcx
	jz	.Lappend_store

	/*
	 * The word test fired.  Emit the word one byte at a time, lowest
	 * byte first, shifting the next byte into %dl after each store, and
	 * stop right after a zero byte has been written.
	 */
	.rept	7
	movb	%dl, (%rdi)
	incq	%rdi
	testb	%dl, %dl
	jz	.Lreturn
	shrq	$8, %rdx
	.endr

	/*
	 * Eighth byte.  If it is nonzero too, the test was a false alarm:
	 * all eight bytes have been stored and %rsi already points at the
	 * next word, so word copying simply continues.
	 */
	movb	%dl, (%rdi)
	incq	%rdi
	testb	%dl, %dl
	jnz	.Lappend_word

.Lreturn:
	ret
ARCHEND(strcat, scalar)
/*
 * char *strcat(char *dest, const char *src) -- baseline variant.
 *
 * Computes strlen(dest), lets __stpcpy copy src to dest + strlen(dest)
 * and returns dest.  Intended for CPUs that have any SIMD support at all:
 * the scalar implementation above is better for the scalar case as it
 * avoids the function call overhead, but it is pessimal whenever SIMD
 * routines could be called instead.
 *
 * In:	%rdi = dest, %rsi = src
 * Out:	%rax = dest
 *
 * Frame layout, relative to %rbp:
 *	 -8	caller's %rbx (%rbx carries dest across both calls)
 *	-16	src, spilled because %rsi is not preserved by strlen
 * With the return address, these three pushes make 32 bytes, so %rsp is
 * 16-byte aligned at both call sites provided it was aligned at the
 * caller's call instruction.
 */
ARCHENTRY(strcat, baseline)
	pushq	%rbp
	movq	%rsp, %rbp
	pushq	%rbx
	pushq	%rsi
	movq	%rdi, %rbx		/* keep dest for later */
	call	CNAME(strlen)		/* %rax = strlen(dest) */
	movq	-16(%rbp), %rsi		/* reload src */
	leaq	(%rbx,%rax), %rdi	/* dest + strlen(dest) */
	call	CNAME(__stpcpy)		/* __stpcpy(dest + strlen(dest), src) */
	movq	%rbx, %rax		/* return dest */
	movq	-8(%rbp), %rbx		/* restore caller's %rbx */
	leave
	ret
ARCHEND(strcat, baseline)
/* Empty .note.GNU-stack section: marks this object as not needing an executable stack. */
.section .note.GNU-stack,"",%progbits
|