1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
|
/*-
* Written by Mateusz Guzik <mjg@freebsd.org>
* Copyright (c) 2023 The FreeBSD Foundation
*
* Portions of this software were developed by Robert Clausecker
* <fuz@FreeBSD.org> under sponsorship from the FreeBSD Foundation.
*
* Public domain.
*/
#include <machine/asm.h>
#include "amd64_archlevel.h"
/*
* Note: this routine was written with kernel use in mind (read: no simd),
* it is only present in userspace as a temporary measure until something
* better gets imported.
*/
#define ALIGN_TEXT .p2align 4,0x90 /* 16-byte alignment, nop filled */
ARCHFUNCS(strlen)
ARCHFUNC(strlen, scalar)
ARCHFUNC(strlen, baseline)
ENDARCHFUNCS(strlen)
/*
* strlen(string)
* %rdi
*
* Uses the ((x - 0x01....01) & ~x & 0x80....80) trick.
*
* 0x01....01 is replaced with 0x0 - 0x01....01 so that it can be added
* with leaq.
*
* For a description see either:
* - "Hacker's Delight" by Henry S. Warren, Jr.
* - "Optimizing subroutines in assembly language: An optimization guide for x86 platforms"
* by Agner Fog
*
* The latter contains a 32-bit variant of the same algorithm coded in assembly for i386.
*/
ARCHENTRY(strlen, scalar)
movabsq $0xfefefefefefefeff,%r8
movabsq $0x8080808080808080,%r9
movq %rdi,%r10
movq %rdi,%rcx
testb $7,%dil
jz 2f
/*
* Handle misaligned reads: align to 8 and fill
* the spurious bytes.
*/
andq $~7,%rdi
movq (%rdi),%r11
shlq $3,%rcx
movq $-1,%rdx
shlq %cl,%rdx
notq %rdx
orq %rdx,%r11
leaq (%r11,%r8),%rcx
notq %r11
andq %r11,%rcx
andq %r9,%rcx
jnz 3f
/*
* Main loop.
*/
ALIGN_TEXT
1:
leaq 8(%rdi),%rdi
2:
movq (%rdi),%r11
leaq (%r11,%r8),%rcx
notq %r11
andq %r11,%rcx
andq %r9,%rcx
jz 1b
3:
bsfq %rcx,%rcx
shrq $3,%rcx
leaq (%rcx,%rdi),%rax
subq %r10,%rax
ret
ARCHEND(strlen, scalar)
ARCHENTRY(strlen, baseline)
mov %rdi, %rcx
pxor %xmm1, %xmm1
and $~0xf, %rdi # align string
pcmpeqb (%rdi), %xmm1 # compare head (with junk before string)
mov %rcx, %rsi # string pointer copy for later
and $0xf, %ecx # amount of bytes rdi is past 16 byte alignment
pmovmskb %xmm1, %eax
add $32, %rdi # advance to next iteration
shr %cl, %eax # clear out matches in junk bytes
test %eax, %eax # any match? (can't use ZF from SHR as CL=0 is possible)
jnz 2f
ALIGN_TEXT
1: pxor %xmm1, %xmm1
pcmpeqb -16(%rdi), %xmm1 # find NUL bytes
pmovmskb %xmm1, %eax
test %eax, %eax # were any NUL bytes present?
jnz 3f
/* the same unrolled once more */
pxor %xmm1, %xmm1
pcmpeqb (%rdi), %xmm1
pmovmskb %xmm1, %eax
add $32, %rdi # advance to next iteration
test %eax, %eax
jz 1b
/* match found in loop body */
sub $16, %rdi # undo half the advancement
3: tzcnt %eax, %eax # find the first NUL byte
sub %rsi, %rdi # string length until beginning of (%rdi)
lea -16(%rdi, %rax, 1), %rax # that plus loc. of NUL byte: full string length
ret
/* match found in head */
2: tzcnt %eax, %eax # compute string length
ret
ARCHEND(strlen, baseline)
.section .note.GNU-stack,"",%progbits
|