polars_arrow/legacy/kernels/
sort_partition.rs

1use std::fmt::Debug;
2
3use polars_utils::IdxSize;
4use polars_utils::itertools::Itertools;
5use polars_utils::total_ord::TotalEq;
6
7use crate::types::NativeType;
8
9/// Find partition indexes such that every partition contains unique groups.
10fn find_partition_points<T>(values: &[T], n: usize, descending: bool) -> Vec<usize>
11where
12    T: Debug + NativeType,
13{
14    let len = values.len();
15    if n > len {
16        return find_partition_points(values, len / 2, descending);
17    }
18    if n < 2 {
19        return vec![];
20    }
21    let chunk_size = len / n;
22
23    let mut partition_points = Vec::with_capacity(n + 1);
24
25    let mut start_idx = 0;
26    loop {
27        let end_idx = start_idx + chunk_size;
28        if end_idx >= len {
29            break;
30        }
31        // first take that partition as a slice
32        // and then find the location where the group of the latest value starts
33        let part = &values[start_idx..end_idx];
34
35        let latest_val = values[end_idx];
36        let idx = if descending {
37            part.partition_point(|v| v.tot_gt(&latest_val))
38        } else {
39            part.partition_point(|v| v.tot_lt(&latest_val))
40        };
41
42        if idx != 0 {
43            partition_points.push(idx + start_idx)
44        }
45
46        start_idx += chunk_size;
47    }
48    partition_points
49}
50
51pub fn create_clean_partitions<T>(values: &[T], n: usize, descending: bool) -> Vec<&[T]>
52where
53    T: Debug + NativeType,
54{
55    let part_idx = find_partition_points(values, n, descending);
56    let mut out = Vec::with_capacity(n + 1);
57
58    let mut start_idx = 0_usize;
59    for end_idx in part_idx {
60        if end_idx != start_idx {
61            out.push(&values[start_idx..end_idx]);
62            start_idx = end_idx;
63        }
64    }
65    let latest = &values[start_idx..];
66    if !latest.is_empty() {
67        out.push(latest)
68    }
69
70    out
71}
72
73pub fn partition_to_groups_amortized_varsize<T, I>(
74    values: I,
75    values_len: IdxSize,
76    first_group_offset: IdxSize,
77    nulls_first: bool,
78    offset: IdxSize,
79    out: &mut Vec<[IdxSize; 2]>,
80) where
81    T: Debug + TotalEq,
82    I: IntoIterator<Item = T>,
83{
84    let mut values = values.into_iter().enumerate_idx();
85    if let Some((i, mut first)) = values.next() {
86        out.clear();
87        if nulls_first && first_group_offset > 0 {
88            out.push([0, first_group_offset])
89        }
90
91        let mut first_idx = if nulls_first { first_group_offset } else { 0 } + offset;
92        let mut start = i;
93
94        for (i, val) in values {
95            // new group reached
96            if val.tot_ne(&first) {
97                let len = i - start;
98                start = i;
99                out.push([first_idx, len]);
100                first_idx += len;
101                first = val;
102            }
103        }
104        // add last group
105        if nulls_first {
106            out.push([first_idx, values_len + first_group_offset - first_idx]);
107        } else {
108            out.push([first_idx, values_len - (first_idx - offset)]);
109        }
110
111        if !nulls_first && first_group_offset > 0 {
112            out.push([values_len + offset, first_group_offset])
113        }
114    }
115}
116
117pub fn partition_to_groups_amortized<T>(
118    values: &[T],
119    first_group_offset: IdxSize,
120    nulls_first: bool,
121    offset: IdxSize,
122    out: &mut Vec<[IdxSize; 2]>,
123) where
124    T: Debug + TotalEq + Sized,
125{
126    if let Some(mut first) = values.first() {
127        out.clear();
128        if nulls_first && first_group_offset > 0 {
129            out.push([0, first_group_offset])
130        }
131
132        let mut first_idx = if nulls_first { first_group_offset } else { 0 } + offset;
133
134        for val in values {
135            // new group reached
136            if val.tot_ne(first) {
137                let val_ptr = val as *const T;
138                let first_ptr = first as *const T;
139
140                // SAFETY:
141                // all pointers suffice the invariants
142                let len = unsafe { val_ptr.offset_from(first_ptr) } as IdxSize;
143                out.push([first_idx, len]);
144                first_idx += len;
145                first = val;
146            }
147        }
148        // add last group
149        if nulls_first {
150            out.push([
151                first_idx,
152                values.len() as IdxSize + first_group_offset - first_idx,
153            ]);
154        } else {
155            out.push([first_idx, values.len() as IdxSize - (first_idx - offset)]);
156        }
157
158        if !nulls_first && first_group_offset > 0 {
159            out.push([values.len() as IdxSize + offset, first_group_offset])
160        }
161    }
162}
163
164/// Take a clean-partitioned slice and return the groups slices
165/// With clean-partitioned we mean that the slice contains all groups and are not spilled to another partition.
166///
167/// `first_group_offset` can be used to add insert the `null` values group.
168pub fn partition_to_groups<T>(
169    values: &[T],
170    first_group_offset: IdxSize,
171    nulls_first: bool,
172    offset: IdxSize,
173) -> Vec<[IdxSize; 2]>
174where
175    T: Debug + NativeType,
176{
177    if values.is_empty() {
178        return vec![];
179    }
180    let mut out = Vec::with_capacity(values.len() / 10);
181    partition_to_groups_amortized(values, first_group_offset, nulls_first, offset, &mut out);
182    out
183}
184
185#[cfg(test)]
186mod test {
187    use super::*;
188
189    #[test]
190    fn test_partition_points() {
191        let values = &[1, 3, 3, 3, 3, 5, 5, 5, 9, 9, 10];
192
193        assert_eq!(find_partition_points(values, 4, false), &[1, 5, 8, 10]);
194        assert_eq!(
195            partition_to_groups(values, 0, true, 0),
196            &[[0, 1], [1, 4], [5, 3], [8, 2], [10, 1]]
197        );
198        assert_eq!(
199            partition_to_groups(values, 5, true, 0),
200            &[[0, 5], [5, 1], [6, 4], [10, 3], [13, 2], [15, 1]]
201        );
202        assert_eq!(
203            partition_to_groups(values, 5, false, 0),
204            &[[0, 1], [1, 4], [5, 3], [8, 2], [10, 1], [11, 5]]
205        );
206    }
207}