1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
use core::mem;

// The following ~400 lines of code exist for exactly one purpose, which is
// to optimize this code:
//
//     byte_slice.iter().position(|&b| b > 0x7F).unwrap_or(byte_slice.len())
//
// Yes... Overengineered is a word that comes to mind, but this is effectively
// a very similar problem to memchr, and virtually nobody has been able to
// resist optimizing the crap out of that (except for perhaps the BSD and MUSL
// folks). In particular, this routine makes a very common case (ASCII) very
// fast, which seems worth it. We do stop short of adding AVX variants of the
// code below in order to retain our sanity and also to avoid needing to deal
// with runtime target feature detection. RESIST!
//
// In order to understand the SIMD version below, it would be good to read this
// comment describing how my memchr routine works:
// https://github.com/BurntSushi/rust-memchr/blob/b0a29f267f4a7fad8ffcc8fe8377a06498202883/src/x86/sse2.rs#L19-L106
//
// The primary difference with memchr is that for ASCII, we can do a bit less
// work. In particular, we don't need to detect the presence of a specific
// byte, but rather, whether any byte has its most significant bit set. That
// means we can effectively skip the _mm_cmpeq_epi8 step and jump straight to
// _mm_movemask_epi8.

// The number of bytes in a `usize`. This is the width of a single word read
// by the fallback (non-SSE2) routine.
#[cfg(any(test, miri, not(target_arch = "x86_64")))]
const USIZE_BYTES: usize = mem::size_of::<usize>();
// The number of bytes examined by one iteration of the fallback routine's
// main loop, which reads two words at a time.
#[cfg(any(test, miri, not(target_arch = "x86_64")))]
const FALLBACK_LOOP_SIZE: usize = 2 * USIZE_BYTES;

// This is a mask where the most significant bit of each byte in the usize
// is set. We test this bit to determine whether a character is ASCII or not.
// Namely, a single byte is regarded as an ASCII codepoint if and only if its
// most significant bit is not set.
//
// The mask is written out as a 64-bit constant and then cast to `usize`, so
// on targets where `usize` is narrower than 64 bits the cast keeps only the
// low bytes, each of which still has its most significant bit set.
#[cfg(any(test, miri, not(target_arch = "x86_64")))]
const ASCII_MASK_U64: u64 = 0x8080808080808080;
#[cfg(any(test, miri, not(target_arch = "x86_64")))]
const ASCII_MASK: usize = ASCII_MASK_U64 as usize;

/// Finds the offset of the first byte in `slice` that is not ASCII, i.e. the
/// first byte greater than `0x7F`.
///
/// When every byte is ASCII (which includes the empty slice), `slice.len()`
/// is returned instead.
///
/// On `x86_64` (outside of Miri) this dispatches to the SSE2 implementation;
/// everywhere else it uses the portable word-at-a-time fallback.
pub fn first_non_ascii_byte(slice: &[u8]) -> usize {
    // The two cfg predicates are exact complements, so precisely one of the
    // blocks below survives compilation and becomes the tail expression.
    #[cfg(all(not(miri), target_arch = "x86_64"))]
    {
        first_non_ascii_byte_sse2(slice)
    }

    #[cfg(any(miri, not(target_arch = "x86_64")))]
    {
        first_non_ascii_byte_fallback(slice)
    }
}

/// Portable, word-at-a-time implementation of `first_non_ascii_byte`.
///
/// Returns the index of the first byte in `slice` whose most significant bit
/// is set, or `slice.len()` if no such byte exists.
///
/// The search proceeds in up to four phases:
///
/// 1. Slices shorter than one `usize` are scanned byte by byte.
/// 2. The first word is checked using an unaligned read.
/// 3. The pointer is advanced to a `usize` boundary and, while at least two
///    whole words remain, two aligned words are checked per iteration.
/// 4. Whatever is left over is scanned byte by byte.
#[cfg(any(test, miri, not(target_arch = "x86_64")))]
fn first_non_ascii_byte_fallback(slice: &[u8]) -> usize {
    // Mask selecting the low address bits, i.e. how far an address is past
    // the previous `usize` boundary.
    let align = USIZE_BYTES - 1;
    let start_ptr = slice.as_ptr();
    // One-past-the-end pointer. It is only ever compared against or used as
    // the base of pointer arithmetic; it is never dereferenced here.
    let end_ptr = slice[slice.len()..].as_ptr();
    let mut ptr = start_ptr;

    unsafe {
        // Not enough bytes to read even a single whole word.
        if slice.len() < USIZE_BYTES {
            return first_non_ascii_byte_slow(start_ptr, end_ptr, ptr);
        }

        // Check the first word regardless of the slice's alignment. Since
        // this word starts at the beginning of the slice, the position within
        // the word is also the position within the slice.
        let chunk = read_unaligned_usize(ptr);
        let mask = chunk & ASCII_MASK;
        if mask != 0 {
            return first_non_ascii_byte_mask(mask);
        }

        // Move forward to the next `usize` boundary (a full word if
        // `start_ptr` is already aligned). At most `USIZE_BYTES` bytes are
        // skipped, and all of them were covered by the unaligned read above.
        ptr = ptr_add(ptr, USIZE_BYTES - (start_ptr as usize & align));
        debug_assert!(ptr > start_ptr);
        debug_assert!(ptr_sub(end_ptr, USIZE_BYTES) >= start_ptr);
        // Only enter the two-word loop when the slice is long enough for
        // `end_ptr - FALLBACK_LOOP_SIZE` to stay within the slice.
        if slice.len() >= FALLBACK_LOOP_SIZE {
            // Each iteration requires two whole words between `ptr` and
            // `end_ptr`.
            while ptr <= ptr_sub(end_ptr, FALLBACK_LOOP_SIZE) {
                debug_assert_eq!(0, (ptr as usize) % USIZE_BYTES);

                // Aligned reads of two adjacent words. OR-ing them together
                // lets a single test detect a high bit in either word.
                let a = *(ptr as *const usize);
                let b = *(ptr_add(ptr, USIZE_BYTES) as *const usize);
                if (a | b) & ASCII_MASK != 0 {
                    // What a kludge. We wrap the position finding code into
                    // a non-inlineable function, which makes the codegen in
                    // the tight loop above a bit better by avoiding a
                    // couple extra movs. We pay for it by two additional
                    // stores, but only in the case of finding a non-ASCII
                    // byte.
                    #[inline(never)]
                    unsafe fn findpos(
                        start_ptr: *const u8,
                        ptr: *const u8,
                    ) -> usize {
                        // Re-read the same two words that the caller just
                        // examined.
                        let a = *(ptr as *const usize);
                        let b = *(ptr_add(ptr, USIZE_BYTES) as *const usize);

                        // `at` is the offset of the current word from the
                        // start of the slice.
                        let mut at = sub(ptr, start_ptr);
                        let maska = a & ASCII_MASK;
                        if maska != 0 {
                            return at + first_non_ascii_byte_mask(maska);
                        }

                        // The first word was clean, so the non-ASCII byte
                        // must be in the second one.
                        at += USIZE_BYTES;
                        let maskb = b & ASCII_MASK;
                        debug_assert!(maskb != 0);
                        return at + first_non_ascii_byte_mask(maskb);
                    }
                    return findpos(start_ptr, ptr);
                }
                ptr = ptr_add(ptr, FALLBACK_LOOP_SIZE);
            }
        }
        // Fewer than two whole words remain; finish byte by byte.
        first_non_ascii_byte_slow(start_ptr, end_ptr, ptr)
    }
}

/// SSE2 implementation of `first_non_ascii_byte` for `x86_64`.
///
/// Returns the index of the first byte in `slice` whose most significant bit
/// is set, or `slice.len()` if no such byte exists.
///
/// The search proceeds in up to five phases:
///
/// 1. Slices shorter than one 16-byte vector are scanned byte by byte.
/// 2. The first vector is checked using an unaligned load.
/// 3. The pointer is advanced to a 16-byte boundary and, while at least four
///    whole vectors remain, four aligned vectors are checked per iteration.
/// 4. Remaining whole vectors are checked one at a time.
/// 5. Whatever is left over is scanned byte by byte.
///
/// NOTE(review): this function uses SSE2 intrinsics without any runtime
/// feature detection, presumably relying on SSE2 being available on every
/// `x86_64` target; confirm if unusual targets are to be supported.
#[cfg(all(not(miri), target_arch = "x86_64"))]
fn first_non_ascii_byte_sse2(slice: &[u8]) -> usize {
    use core::arch::x86_64::*;

    // Size of one vector in bytes.
    const VECTOR_SIZE: usize = mem::size_of::<__m128i>();
    // Mask selecting the low address bits, i.e. how far an address is past
    // the previous vector boundary.
    const VECTOR_ALIGN: usize = VECTOR_SIZE - 1;
    // Number of bytes consumed by one iteration of the main loop.
    const VECTOR_LOOP_SIZE: usize = 4 * VECTOR_SIZE;

    let start_ptr = slice.as_ptr();
    // One-past-the-end pointer. It is only ever compared against or used as
    // the base of pointer arithmetic; it is never dereferenced here.
    let end_ptr = slice[slice.len()..].as_ptr();
    let mut ptr = start_ptr;

    unsafe {
        // Not enough bytes to load even a single whole vector.
        if slice.len() < VECTOR_SIZE {
            return first_non_ascii_byte_slow(start_ptr, end_ptr, ptr);
        }

        // Check the first vector regardless of the slice's alignment. The
        // movemask gathers the most significant bit of every byte, so the
        // number of trailing zeros is the index of the first non-ASCII byte.
        let chunk = _mm_loadu_si128(ptr as *const __m128i);
        let mask = _mm_movemask_epi8(chunk);
        if mask != 0 {
            return mask.trailing_zeros() as usize;
        }

        // Move forward to the next vector boundary (a full vector if
        // `start_ptr` is already aligned). At most `VECTOR_SIZE` bytes are
        // skipped, and all of them were covered by the unaligned load above.
        ptr = ptr.add(VECTOR_SIZE - (start_ptr as usize & VECTOR_ALIGN));
        debug_assert!(ptr > start_ptr);
        debug_assert!(end_ptr.sub(VECTOR_SIZE) >= start_ptr);
        // Only enter the four-vector loop when the slice is long enough for
        // `end_ptr - VECTOR_LOOP_SIZE` to stay within the slice.
        if slice.len() >= VECTOR_LOOP_SIZE {
            // Each iteration requires four whole vectors between `ptr` and
            // `end_ptr`.
            while ptr <= ptr_sub(end_ptr, VECTOR_LOOP_SIZE) {
                debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE);

                // Four aligned loads covering 64 contiguous bytes.
                let a = _mm_load_si128(ptr as *const __m128i);
                let b = _mm_load_si128(ptr.add(VECTOR_SIZE) as *const __m128i);
                let c =
                    _mm_load_si128(ptr.add(2 * VECTOR_SIZE) as *const __m128i);
                let d =
                    _mm_load_si128(ptr.add(3 * VECTOR_SIZE) as *const __m128i);

                // OR everything together so that a single movemask tells us
                // whether any of the 64 bytes has its high bit set.
                let or1 = _mm_or_si128(a, b);
                let or2 = _mm_or_si128(c, d);
                let or3 = _mm_or_si128(or1, or2);
                if _mm_movemask_epi8(or3) != 0 {
                    // Something matched. Inspect the vectors in order to find
                    // the earliest match; `at` tracks the offset of the
                    // vector currently being inspected.
                    let mut at = sub(ptr, start_ptr);
                    let mask = _mm_movemask_epi8(a);
                    if mask != 0 {
                        return at + mask.trailing_zeros() as usize;
                    }

                    at += VECTOR_SIZE;
                    let mask = _mm_movemask_epi8(b);
                    if mask != 0 {
                        return at + mask.trailing_zeros() as usize;
                    }

                    at += VECTOR_SIZE;
                    let mask = _mm_movemask_epi8(c);
                    if mask != 0 {
                        return at + mask.trailing_zeros() as usize;
                    }

                    // The first three vectors were clean, so the non-ASCII
                    // byte must be in the last one.
                    at += VECTOR_SIZE;
                    let mask = _mm_movemask_epi8(d);
                    debug_assert!(mask != 0);
                    return at + mask.trailing_zeros() as usize;
                }
                ptr = ptr_add(ptr, VECTOR_LOOP_SIZE);
            }
        }
        // Fewer than four whole vectors remain; check them one at a time.
        while ptr <= end_ptr.sub(VECTOR_SIZE) {
            debug_assert!(sub(end_ptr, ptr) >= VECTOR_SIZE);

            let chunk = _mm_loadu_si128(ptr as *const __m128i);
            let mask = _mm_movemask_epi8(chunk);
            if mask != 0 {
                return sub(ptr, start_ptr) + mask.trailing_zeros() as usize;
            }
            ptr = ptr.add(VECTOR_SIZE);
        }
        // Less than one whole vector remains; finish byte by byte.
        first_non_ascii_byte_slow(start_ptr, end_ptr, ptr)
    }
}

/// Byte-at-a-time scan used for short slices and for the tail left over by
/// the word/vector loops.
///
/// Starting at `ptr`, looks for the first byte greater than `0x7F` and
/// returns its offset from `start_ptr`. If `end_ptr` is reached first, the
/// distance from `start_ptr` to `end_ptr` is returned.
///
/// # Safety
///
/// Callers must ensure `start_ptr <= ptr <= end_ptr` and that every byte in
/// `ptr..end_ptr` is valid to read.
#[inline(always)]
unsafe fn first_non_ascii_byte_slow(
    start_ptr: *const u8,
    end_ptr: *const u8,
    mut ptr: *const u8,
) -> usize {
    debug_assert!(start_ptr <= ptr);
    debug_assert!(ptr <= end_ptr);

    loop {
        if ptr >= end_ptr {
            // Ran off the end without seeing a non-ASCII byte.
            return sub(end_ptr, start_ptr);
        }
        // A byte is non-ASCII exactly when its most significant bit is set.
        if (*ptr & 0x80) != 0 {
            return sub(ptr, start_ptr);
        }
        ptr = ptr.add(1);
    }
}

/// Compute the position of the first non-ASCII byte described by the given
/// mask.
///
/// The mask should be computed by `chunk & ASCII_MASK`, where `chunk` is one
/// `usize` worth of contiguous bytes of the slice being checked, read in
/// native byte order, where *at least* one of those bytes is not an ASCII
/// byte.
///
/// The position returned is a byte offset within that chunk, so it is always
/// less than the size of a `usize` in bytes. "First" means lowest memory
/// address, which is why the low-order bits are inspected on little-endian
/// targets and the high-order bits on big-endian targets.
#[cfg(any(test, miri, not(target_arch = "x86_64")))]
fn first_non_ascii_byte_mask(mask: usize) -> usize {
    let zero_bits = if cfg!(target_endian = "little") {
        mask.trailing_zeros()
    } else {
        mask.leading_zeros()
    };
    // Every byte contributes eight bits, so whole bytes skipped = bits / 8.
    (zero_bits / 8) as usize
}

/// Move `ptr` forward by `amt` bytes.
///
/// # Safety
///
/// This is a thin wrapper around the raw pointer `offset` method, so the
/// caller must uphold that method's contract. `amt` must be smaller than
/// `isize::MAX`, which is checked in debug builds only.
unsafe fn ptr_add(ptr: *const u8, amt: usize) -> *const u8 {
    debug_assert!(amt < ::core::isize::MAX as usize);
    let forward = amt as isize;
    ptr.offset(forward)
}

/// Move `ptr` backward by `amt` bytes.
///
/// # Safety
///
/// This is a thin wrapper around the raw pointer `offset` method, so the
/// caller must uphold that method's contract. `amt` must be smaller than
/// `isize::MAX`, which is checked in debug builds only.
unsafe fn ptr_sub(ptr: *const u8, amt: usize) -> *const u8 {
    debug_assert!(amt < ::core::isize::MAX as usize);
    // Negate with wrapping arithmetic so that the negation itself can never
    // trigger an overflow panic.
    let backward = 0isize.wrapping_sub(amt as isize);
    ptr.offset(backward)
}

/// Read one native-endian `usize` starting at `ptr`, which need not be
/// aligned.
///
/// # Safety
///
/// `ptr` must be valid for reading `size_of::<usize>()` bytes.
#[cfg(any(test, miri, not(target_arch = "x86_64")))]
unsafe fn read_unaligned_usize(ptr: *const u8) -> usize {
    (ptr as *const usize).read_unaligned()
}

/// Return the distance in bytes from `b` up to `a`.
///
/// `a` must not be lower than `b`; this is checked in debug builds only.
fn sub(a: *const u8, b: *const u8) -> usize {
    debug_assert!(a >= b);
    let (hi, lo) = (a as usize, b as usize);
    hi - lo
}

#[cfg(test)]
mod tests {
    use super::*;

    // The strategy here is brute force: try every position at which the first
    // non-ASCII byte can occur, combined with many different starting
    // offsets (and therefore alignments) of the slice being searched.

    // Non-ASCII text appended after the ASCII prefix in the `negative_*`
    // tests.
    #[cfg(not(miri))]
    const SNOWMEN: &str = "☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃";

    // Runs `find` over haystacks made of `i` ASCII bytes followed by
    // `SNOWMEN`, with the first `align` bytes chopped off. When `align` does
    // not land on a valid `str` boundary the haystack degenerates to the
    // empty string, for which the expected answer is also zero.
    #[cfg(not(miri))]
    fn check_negative(find: fn(&[u8]) -> usize) {
        for i in 0..517 {
            for align in 0..65 {
                let mut haystack = "a".repeat(i);
                haystack.push_str(SNOWMEN);
                let s = haystack.get(align..).unwrap_or("");
                assert_eq!(
                    i.saturating_sub(align),
                    find(s.as_bytes()),
                    "i: {:?}, align: {:?}, len: {:?}, s: {:?}",
                    i,
                    align,
                    s.len(),
                    s
                );
            }
        }
    }

    // All-ASCII input: the fallback must report the length of the slice.
    #[test]
    fn positive_fallback_forward() {
        for i in 0..517 {
            let s = "a".repeat(i);
            let got = first_non_ascii_byte_fallback(s.as_bytes());
            assert_eq!(
                i,
                got,
                "i: {:?}, len: {:?}, s: {:?}",
                i,
                s.len(),
                s
            );
        }
    }

    // All-ASCII input: the SSE2 routine must report the length of the slice.
    #[test]
    #[cfg(all(target_arch = "x86_64", not(miri)))]
    fn positive_sse2_forward() {
        for i in 0..517 {
            let bytes = "a".repeat(i).into_bytes();
            let got = first_non_ascii_byte_sse2(&bytes);
            assert_eq!(bytes.len(), got);
        }
    }

    // Input containing non-ASCII bytes, searched with the fallback.
    #[test]
    #[cfg(not(miri))]
    fn negative_fallback_forward() {
        check_negative(first_non_ascii_byte_fallback);
    }

    // Input containing non-ASCII bytes, searched with the SSE2 routine.
    #[test]
    #[cfg(all(target_arch = "x86_64", not(miri)))]
    fn negative_sse2_forward() {
        check_negative(first_non_ascii_byte_sse2);
    }
}