advent-of-code-rust/src/year2015/day04.rs at main · maneatingape/advent-of-code-rust · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
//! # The Ideal Stocking Stuffer
//!
//! This solution relies on brute forcing combinations as quickly as possible using an internal
//! implementation of the [`MD5`] hashing algorithm.
//!
//! Each number's hash is independent of the others, so we speed things up by using threading
//! to search in parallel in blocks of 1000 numbers at a time.
//!
//! Using the [`format!`] macro to join the secret key to the number is quite slow. To go faster
//! we reuse the same `u8` buffer, incrementing digits one at a time.
//! The numbers from 1 to 999 are handled specially.
//!
//! Interestingly, the total time to solve this problem is *extremely* sensitive to the secret key
//! provided as input. For example, my key required ~10⁷ iterations to find the answer to part two.
//! However, for unit testing, I was able to randomly find a value that takes only 455 iterations,
//! about 22,000 times faster!
//!
//! [`MD5`]: crate::util::md5
//! [`format!`]: std::format
use crate::util::md5::*;
use crate::util::thread::*;
use implementation::*;
use std::sync::atomic::{AtomicU32, Ordering};

pub struct Shared {
    prefix: String,
    iter: AtomicIter,
    first: AtomicU32,
    second: AtomicU32,
}

pub fn parse(input: &str) -> Shared {
    let shared = Shared {
        prefix: input.trim().to_owned(),
        iter: AtomicIter::new(1000, 1000),
        first: AtomicU32::new(u32::MAX),
        second: AtomicU32::new(u32::MAX),
    };

    // Handle the first 999 numbers specially as the number of digits varies.
    for n in 1..1000 {
        let (mut buffer, size) = format_string(&shared.prefix, n);
        check_hash(&mut buffer, size, n, &shared);
    }

    // Use as many cores as possible to parallelize the remaining search.
    spawn(|| worker(&shared));
    shared
}

pub fn part1(input: &Shared) -> u32 {
    input.first.load(Ordering::Relaxed)
}

pub fn part2(input: &Shared) -> u32 {
    input.second.load(Ordering::Relaxed)
}

fn format_string(prefix: &str, mut n: u32) -> ([u8; 64], usize) {
    let prefix_len = prefix.len();
    let digits = n.max(1).ilog10() as usize + 1;
    let size = prefix_len + digits;

    let mut buffer = [0; 64];
    buffer[..prefix_len].copy_from_slice(prefix.as_bytes());

    for i in (prefix_len..size).rev() {
        buffer[i] = b'0' + (n % 10) as u8;
        n /= 10;
    }

    (buffer, size)
}

fn check_hash(buffer: &mut [u8], size: usize, n: u32, shared: &Shared) {
    let [result, ..] = hash(buffer, size);

    if result & 0xffffff00 == 0 {
        shared.second.fetch_min(n, Ordering::Relaxed);
        shared.iter.stop();
    } else if result & 0xfffff000 == 0 {
        shared.first.fetch_min(n, Ordering::Relaxed);
    }
}

#[cfg(not(feature = "simd"))]
mod implementation {
    use super::*;

    pub(super) fn worker(shared: &Shared) {
        while let Some(offset) = shared.iter.next() {
            let (mut buffer, size) = format_string(&shared.prefix, offset);

            for n in 0..1000 {
                // Format macro is very slow, so update digits directly.
                buffer[size - 3] = b'0' + (n / 100) as u8;
                buffer[size - 2] = b'0' + ((n / 10) % 10) as u8;
                buffer[size - 1] = b'0' + (n % 10) as u8;

                check_hash(&mut buffer, size, offset + n, shared);
            }
        }
    }
}

#[cfg(feature = "simd")]
mod implementation {
    use super::*;
    use crate::util::bitset::*;
    use crate::util::md5::simd::hash_fixed;
    use std::simd::cmp::SimdPartialEq as _;
    use std::simd::*;

    #[expect(clippy::needless_range_loop)]
    fn check_hash_simd<const N: usize>(
        buffers: &mut [[u8; 64]; N],
        size: usize,
        start: u32,
        offset: u32,
        shared: &Shared,
    ) {
        // Format macro is very slow, so update digits directly.
        for i in 0..N {
            let n = offset + i as u32;
            buffers[i][size - 3] = b'0' + (n / 100) as u8;
            buffers[i][size - 2] = b'0' + ((n / 10) % 10) as u8;
            buffers[i][size - 1] = b'0' + (n % 10) as u8;
        }

        let [result, ..] = hash_fixed(buffers, size);
        let bitmask = (result & Simd::splat(0xfffff000)).simd_eq(Simd::splat(0)).to_bitmask();

        for i in bitmask.biterator() {
            if result[i] & 0xffffff00 == 0 {
                shared.second.fetch_min(start + offset + i as u32, Ordering::Relaxed);
                shared.iter.stop();
            } else {
                shared.first.fetch_min(start + offset + i as u32, Ordering::Relaxed);
            }
        }
    }

    pub(super) fn worker(shared: &Shared) {
        while let Some(start) = shared.iter.next() {
            let (prefix, size) = format_string(&shared.prefix, start);
            let buffers = &mut [prefix; 32];

            for offset in (0..992).step_by(32) {
                check_hash_simd(buffers, size, start, offset, shared);
            }

            let buffers = &mut [prefix; 8];
            check_hash_simd(buffers, size, start, 992, shared);
        }
    }
}