path: root/lib/libc/amd64/string/memcmp.S
blob: dc8bcff73cb9c0d81782e7e08002b6742b6aa755
/*-
 * Copyright (c) 2018, 2023 The FreeBSD Foundation
 *
 * This software was developed by Mateusz Guzik <mjg@FreeBSD.org>
 * under sponsorship from the FreeBSD Foundation.
 *
 * Portions of this software were developed by Robert Clausecker
 * <fuz@FreeBSD.org> under sponsorship from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <machine/asm.h>
#include <machine/param.h>

#include "amd64_archlevel.h"

/*
 * Note: this routine was written with kernel use in mind (read: no SIMD);
 * it is only present in userspace as a temporary measure until something
 * better gets imported.
 */

#define ALIGN_TEXT      .p2align 4,0x90 /* 16-byte alignment, nop filled */

#ifdef BCMP
#define memcmp bcmp
#endif
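
/*
 * The ARCHFUNCS/ARCHFUNC/ARCHENTRY macros from amd64_archlevel.h
 * register the implementations below and resolve memcmp at load time
 * to the best variant the CPU supports: "scalar" (no SIMD) or
 * "baseline" (SSE2).
 */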

ARCHFUNCS(memcmp)
	ARCHFUNC(memcmp, scalar)
	ARCHFUNC(memcmp, baseline)
ENDARCHFUNCS(memcmp)

ARCHENTRY(memcmp, scalar)
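	/*
	 * Numeric labels encode the buffer sizes a block handles, e.g.
	 * 100816 deals with 8--16 byte buffers and 101632 with 16--32
	 * byte ones; mismatch targets append the offset of the failing
	 * load, e.g. 10163208 for the load at offset 8 on the 16--32
	 * byte path.
	 */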
	xorl	%eax,%eax
10:
	cmpq	$16,%rdx
	ja	101632f

	cmpb	$8,%dl
	jg	100816f

	cmpb	$4,%dl
	jg	100408f

	cmpb	$2,%dl
	jge	100204f

	cmpb	$1,%dl
	jl	100000f
	movzbl	(%rdi),%eax
	movzbl	(%rsi),%r8d
	subl	%r8d,%eax
100000:
	ret

	ALIGN_TEXT
100816:
	movq	(%rdi),%r8
	movq	(%rsi),%r9
	cmpq	%r8,%r9
	jne	80f
	movq	-8(%rdi,%rdx),%r8
	movq	-8(%rsi,%rdx),%r9
	cmpq	%r8,%r9
	jne	10081608f
	ret
	ALIGN_TEXT
100408:
	movl	(%rdi),%r8d
	movl	(%rsi),%r9d
	cmpl	%r8d,%r9d
	jne	80f
	movl	-4(%rdi,%rdx),%r8d
	movl	-4(%rsi,%rdx),%r9d
	cmpl	%r8d,%r9d
	jne	10040804f
	ret
	ALIGN_TEXT
100204:
	movzwl	(%rdi),%r8d
	movzwl	(%rsi),%r9d
	cmpl	%r8d,%r9d
	jne	1f
	movzwl	-2(%rdi,%rdx),%r8d
	movzwl	-2(%rsi,%rdx),%r9d
	cmpl	%r8d,%r9d
	jne	1f
	ret
	ALIGN_TEXT
101632:
	cmpq	$32,%rdx
	ja	103200f
	movq	(%rdi),%r8
	movq	(%rsi),%r9
	cmpq	%r8,%r9
	jne	80f
	movq	8(%rdi),%r8
	movq	8(%rsi),%r9
	cmpq	%r8,%r9
	jne	10163208f
	movq	-16(%rdi,%rdx),%r8
	movq	-16(%rsi,%rdx),%r9
	cmpq	%r8,%r9
	jne	10163216f
	movq	-8(%rdi,%rdx),%r8
	movq	-8(%rsi,%rdx),%r9
	cmpq	%r8,%r9
	jne	10163224f
	ret
	ALIGN_TEXT
103200:
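	/*
	 * At least 32 bytes remain: compare 32 bytes per iteration.
	 * The subq/orq sequences leave a non-zero value in %r9 iff one
	 * of the two 8-byte words differs.
	 */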
	movq	(%rdi),%r8
	movq	8(%rdi),%r9
	subq	(%rsi),%r8
	subq	8(%rsi),%r9
	orq	%r8,%r9
	jnz	10320000f

	movq    16(%rdi),%r8
	movq    24(%rdi),%r9
	subq    16(%rsi),%r8
	subq    24(%rsi),%r9
	orq	%r8,%r9
	jnz     10320016f

	leaq	32(%rdi),%rdi
	leaq	32(%rsi),%rsi
	subq	$32,%rdx
	cmpq	$32,%rdx
	jae	103200b
	cmpb	$0,%dl
	jne	10b
	ret

/*
 * A mismatch was found.
 */
#ifdef BCMP
	ALIGN_TEXT
10320016:
10320000:
10081608:
10163224:
10163216:
10163208:
10040804:
80:
1:
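	/* %eax is still zero here; bcmp only needs a non-zero result */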
	leal	1(%eax),%eax
	ret
#else
/*
 * We need to compute the difference between the first pair of
 * mismatching bytes.  Start by narrowing the range down (16 -> 8 -> 4 bytes).
 */
	ALIGN_TEXT
10320016:
	leaq	16(%rdi),%rdi
	leaq	16(%rsi),%rsi
10320000:
	movq	(%rdi),%r8
	movq	(%rsi),%r9
	cmpq	%r8,%r9
	jne	80f
	leaq	8(%rdi),%rdi
	leaq	8(%rsi),%rsi
	jmp	80f
	ALIGN_TEXT
10081608:
10163224:
	leaq	-8(%rdi,%rdx),%rdi
	leaq	-8(%rsi,%rdx),%rsi
	jmp	80f
	ALIGN_TEXT
10163216:
	leaq	-16(%rdi,%rdx),%rdi
	leaq	-16(%rsi,%rdx),%rsi
	jmp	80f
	ALIGN_TEXT
10163208:
	leaq	8(%rdi),%rdi
	leaq	8(%rsi),%rsi
	jmp	80f
	ALIGN_TEXT
10040804:
	leaq	-4(%rdi,%rdx),%rdi
	leaq	-4(%rsi,%rdx),%rsi
	jmp	1f

	ALIGN_TEXT
80:
	movl	(%rdi),%r8d
	movl	(%rsi),%r9d
	cmpl	%r8d,%r9d
	jne	1f
	leaq	4(%rdi),%rdi
	leaq	4(%rsi),%rsi

/*
 * We have up to 4 bytes to inspect.
 */
1:
	movzbl	(%rdi),%eax
	movzbl	(%rsi),%r8d
	cmpb	%r8b,%al
	jne	2f

	movzbl	1(%rdi),%eax
	movzbl	1(%rsi),%r8d
	cmpb	%r8b,%al
	jne	2f

	movzbl	2(%rdi),%eax
	movzbl	2(%rsi),%r8d
	cmpb	%r8b,%al
	jne	2f

	movzbl	3(%rdi),%eax
	movzbl	3(%rsi),%r8d
2:
	subl	%r8d,%eax
	ret
#endif
ARCHEND(memcmp, scalar)

ARCHENTRY(memcmp, baseline)
	cmp		$32, %rdx		# enough to permit use of the long kernel?
	ja		.Llong

	test		%rdx, %rdx		# zero bytes buffer?
	je		.L0

	/*
	 * Compare strings of 1--32 bytes.  We want to do this by
	 * loading into two xmm registers and then comparing.  To avoid
	 * crossing into unmapped pages, we either load 32 bytes from
	 * the start of the buffer or the 32 bytes ending at its last
	 * byte, depending on whether the overread area would cross a
	 * page boundary.
	 */
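
	/*
	 * The page-crossing check below is roughly the following C
	 * (illustrative sketch only, not part of the build).  The last
	 * overread byte and the last buffer byte are less than
	 * PAGE_SIZE apart, so the PAGE_SIZE bit of their addresses
	 * differs exactly when a page boundary lies between them:
	 *
	 *	static int
	 *	overread_crosses_page(const char *buf, size_t len)
	 *	{
	 *		uintptr_t last_overread = (uintptr_t)buf + 31;
	 *		uintptr_t last_byte = (uintptr_t)buf + len - 1;
	 *
	 *		return (((last_overread ^ last_byte) & PAGE_SIZE) != 0);
	 *	}
	 */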

	/* check whether the 32-byte overreads would cross a page boundary */
	lea		31(%rdi), %eax		# end of overread
	lea		31(%rsi), %r8d
	lea		-1(%rdi, %rdx, 1), %ecx	# last character in buffer
	lea		-1(%rsi, %rdx, 1), %r9d
	xor		%ecx, %eax
	xor		%r9d, %r8d
	test		$PAGE_SIZE, %eax	# are they on different pages?
	jz		0f

	/* fix up rdi */
	movdqu		-32(%rdi, %rdx, 1), %xmm0
	movdqu		-16(%rdi, %rdx, 1), %xmm1
	lea		-8(%rsp), %rdi		# end of replacement buffer
	sub		%rdx, %rdi		# start of replacement buffer
	movdqa		%xmm0, -40(%rsp)	# copy to replacement buffer
	movdqa		%xmm1, -24(%rsp)

0:	test		$PAGE_SIZE, %r8d
	jz		0f

	/* fix up rsi */
	movdqu		-32(%rsi, %rdx, 1), %xmm0
	movdqu		-16(%rsi, %rdx, 1), %xmm1
	lea		-40(%rsp), %rsi		# end of replacement buffer
	sub		%rdx, %rsi		# start of replacement buffer
	movdqa		%xmm0, -72(%rsp)	# copy to replacement buffer
	movdqa		%xmm1, -56(%rsp)
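
	/*
	 * Both replacement buffers live in the 128-byte red zone below
	 * %rsp that the SysV AMD64 ABI reserves for leaf functions, so
	 * no stack frame has to be set up.
	 */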

	/* load data and compare properly */
0:	movdqu		16(%rdi), %xmm1
	movdqu		16(%rsi), %xmm3
	movdqu		(%rdi), %xmm0
	movdqu		(%rsi), %xmm2
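	/*
	 * Build a mask with ones at the byte positions past the
	 * buffer's end: for len = 5 the low 32 bits of %rdx become
	 * 0xffffffe0, so the overread positions can later be forced to
	 * count as matches.
	 */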
	mov		%edx, %ecx
	mov		$-1, %edx
	shl		%cl, %rdx		# ones at positions past the buffer's end
	pcmpeqb		%xmm3, %xmm1
	pcmpeqb		%xmm2, %xmm0
	pmovmskb	%xmm1, %ecx
	pmovmskb	%xmm0, %eax
	shl		$16, %ecx
	or		%ecx, %eax		# ones where the buffers match
	or		%edx, %eax		# also mark positions past the buffer's end
	not		%eax			# ones where there is a mismatch
#ifndef BCMP
	bsf		%eax, %edx		# location of the first mismatch
	cmovz		%eax, %edx		# including if there is no mismatch
	movzbl		(%rdi, %rdx, 1), %eax	# mismatching bytes
	movzbl		(%rsi, %rdx, 1), %edx
	sub		%edx, %eax
#endif
	ret

	/* empty input */
.L0:	xor		%eax, %eax
	ret

	/* compare 33+ bytes */
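	/*
	 * Strategy: compare the first 16 bytes unaligned, then align
	 * %rdi down to 16 bytes and keep %rsi as the distance between
	 * the two buffers so the main loop advances a single pointer
	 * and can use aligned loads through %rdi.  The loop processes
	 * 32 bytes per iteration; the final, possibly overlapping 32
	 * bytes are compared through the end pointer.
	 */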
	ALIGN_TEXT
.Llong:	movdqu		(%rdi), %xmm0		# load head
	movdqu		(%rsi), %xmm2
	mov		%rdi, %rcx
	sub		%rdi, %rsi		# express rsi as distance from rdi
	and		$~0xf, %rdi		# align rdi to 16 bytes
	movdqu		16(%rsi, %rdi, 1), %xmm1
	pcmpeqb		16(%rdi), %xmm1		# compare second half of this iteration
	add		%rcx, %rdx		# pointer to one past the last buffer byte
	jc		.Loverflow		# did this overflow?
0:	pcmpeqb		%xmm2, %xmm0
	pmovmskb	%xmm0, %eax
	xor		$0xffff, %eax		# any mismatch?
	jne		.Lmismatch_head
	add		$64, %rdi		# advance to next iteration
	jmp		1f			# and get going with the loop

	/*
	 * If we got here, a buffer length was passed to memcmp(a, b, len)
	 * such that a + len < a.  While this sort of usage is illegal,
	 * it is plausible that a caller tries to do something like
	 * memcmp(a, b, SIZE_MAX) if a and b are known to differ, intending
	 * for memcmp() to stop comparing at the first mismatch.  This
	 * behaviour is not guaranteed by any version of ISO/IEC 9899,
	 * but usually works out in practice.  Let's try to make this
	 * case work by comparing until the end of the address space.
	 */
.Loverflow:
	mov		$-1, %rdx		# compare until the end of memory
	jmp		0b

	/* process buffer 32 bytes at a time */
	ALIGN_TEXT
0:	movdqu		-32(%rsi, %rdi, 1), %xmm0
	movdqu		-16(%rsi, %rdi, 1), %xmm1
	pcmpeqb		-32(%rdi), %xmm0
	pcmpeqb		-16(%rdi), %xmm1
	add		$32, %rdi		# advance to next iteration
1:	pand		%xmm0, %xmm1		# 0xff where both halves matched
	pmovmskb	%xmm1, %eax
	cmp		$0xffff, %eax		# all bytes matched?
	jne		.Lmismatch
	cmp		%rdx, %rdi		# end of buffer reached?
	jb		0b

	/* less than 32 bytes left to compare */
	movdqu		-16(%rdx), %xmm1	# load 32 byte tail through end pointer
	movdqu		-16(%rdx, %rsi, 1), %xmm3
	movdqu		-32(%rdx), %xmm0
	movdqu		-32(%rdx, %rsi, 1), %xmm2
	pcmpeqb		%xmm3, %xmm1
	pcmpeqb		%xmm2, %xmm0
	pmovmskb	%xmm1, %ecx
	pmovmskb	%xmm0, %eax
	shl		$16, %ecx
	or		%ecx, %eax		# ones where the buffers match
	not		%eax			# ones where there is a mismatch
#ifndef BCMP
	bsf		%eax, %ecx		# location of the first mismatch
	cmovz		%eax, %ecx		# including if there is no mismatch
	add		%rcx, %rdx		# pointer to potential mismatch
	movzbl		-32(%rdx), %eax		# mismatching bytes
	movzbl		-32(%rdx, %rsi, 1), %edx
	sub		%edx, %eax
#endif
	ret

#ifdef BCMP
.Lmismatch:
	mov		$1, %eax
.Lmismatch_head:
	ret
#else /* memcmp */
.Lmismatch_head:
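	/*
	 * tzcnt encodes as rep bsf and thus executes as plain bsf on
	 * CPUs without BMI1; %eax is known to be non-zero here, so both
	 * instructions yield the mismatch offset.
	 */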
	tzcnt		%eax, %eax		# location of mismatch
	add		%rax, %rcx		# pointer to mismatch
	movzbl		(%rcx), %eax		# mismatching bytes
	movzbl		(%rcx, %rsi, 1), %ecx
	sub		%ecx, %eax
	ret

.Lmismatch:
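	/*
	 * %rdi has already been advanced past the 32-byte chunk that
	 * mismatched, so the chunk occupies -64(%rdi)..-33(%rdi) and
	 * the load that produced %xmm1 came from -48(%rdi).
	 */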
	movdqu		-48(%rsi, %rdi, 1), %xmm1
	pcmpeqb		-48(%rdi), %xmm1	# reconstruct xmm1 before PAND
	pmovmskb	%xmm0, %eax		# mismatches in first 16 bytes
	pmovmskb	%xmm1, %edx		# mismatches in second 16 bytes
	shl		$16, %edx
	or		%edx, %eax		# mismatches in both
	not		%eax			# matches in both
	tzcnt		%eax, %eax		# location of mismatch
	add		%rax, %rdi		# pointer to mismatch
	movzbl		-64(%rdi), %eax		# mismatching bytes
	movzbl		-64(%rdi, %rsi, 1), %ecx
	sub		%ecx, %eax
	ret
#endif
ARCHEND(memcmp, baseline)

	.section .note.GNU-stack,"",%progbits