// polars_utils/plpath.rs

1use core::fmt;
2use std::path::{Component, Path};
3use std::str::FromStr;
4use std::sync::Arc;
5
6/// A Path or URI
7#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
8#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
9#[cfg_attr(feature = "dsl-schema", derive(schemars::JsonSchema))]
10pub enum PlPath {
11    Local(Arc<Path>),
12    Cloud(PlCloudPath),
13}
14
15/// A reference to a Path or URI
16#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
17pub enum PlPathRef<'a> {
18    Local(&'a Path),
19    Cloud(PlCloudPathRef<'a>),
20}
21
22#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
23#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
24#[cfg_attr(feature = "dsl-schema", derive(schemars::JsonSchema))]
25pub struct PlCloudPath {
26    /// The scheme used in cloud e.g. `s3://` or `file://`.
27    scheme: CloudScheme,
28    /// The full URI e.g. `s3://path/to/bucket`.
29    uri: Arc<str>,
30}
31
32#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
33pub struct PlCloudPathRef<'a> {
34    /// The scheme used in cloud e.g. `s3://` or `file://`.
35    scheme: CloudScheme,
36    /// The full URI e.g. `s3://path/to/bucket`.
37    uri: &'a str,
38}
39
40impl<'a> fmt::Display for PlCloudPathRef<'a> {
41    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
42        f.write_str(self.uri())
43    }
44}
45
46impl fmt::Display for PlCloudPath {
47    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
48        self.as_ref().fmt(f)
49    }
50}
51
52impl PlCloudPath {
53    pub fn as_ref(&self) -> PlCloudPathRef<'_> {
54        PlCloudPathRef {
55            scheme: self.scheme,
56            uri: self.uri.as_ref(),
57        }
58    }
59
60    pub fn strip_scheme(&self) -> &str {
61        &self.uri[self.scheme.as_str().len() + 3..]
62    }
63}
64
65impl PlCloudPathRef<'_> {
66    pub fn into_owned(self) -> PlCloudPath {
67        PlCloudPath {
68            scheme: self.scheme,
69            uri: self.uri.into(),
70        }
71    }
72
73    pub fn scheme(&self) -> CloudScheme {
74        self.scheme
75    }
76
77    pub fn uri(&self) -> &str {
78        self.uri
79    }
80
81    pub fn strip_scheme(&self) -> &str {
82        &self.uri[self.scheme.as_str().len() + "://".len()..]
83    }
84}
85
86pub struct AddressDisplay<'a> {
87    addr: PlPathRef<'a>,
88}
89
90impl<'a> fmt::Display for AddressDisplay<'a> {
91    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
92        match self.addr {
93            PlPathRef::Local(p) => p.display().fmt(f),
94            PlPathRef::Cloud(p) => p.fmt(f),
95        }
96    }
97}
98
// Generates the `CloudScheme` enum from a list of `Variant = "name"` pairs, together with
// exact-match parsing (`FromStr`) and the inverse `as_str` lookup, so that the variant list
// and both string mappings are written down only once.
macro_rules! impl_scheme {
    ($($variant:ident = $name:literal,)+) => {
        // Variants are declared in invocation order, which is what the derived `Ord` uses.
        #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)]
        #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
        #[cfg_attr(feature = "dsl-schema", derive(schemars::JsonSchema))]
        pub enum CloudScheme {
            $($variant,)+
        }

        impl FromStr for CloudScheme {
            type Err = ();

            /// Parses a bare scheme name (no `://`); matching is exact and case-sensitive.
            fn from_str(s: &str) -> Result<Self, Self::Err> {
                let scheme = match s {
                    $($name => Self::$variant,)+
                    _ => return Err(()),
                };
                Ok(scheme)
            }
        }

        impl CloudScheme {
            /// Returns the scheme name without the trailing `://`.
            pub fn as_str(&self) -> &'static str {
                match *self {
                    $(Self::$variant => $name,)+
                }
            }
        }
    };
}

// Every scheme that is treated as a URI rather than as a local path.
impl_scheme! {
    S3 = "s3",
    S3a = "s3a",
    Gs = "gs",
    Gcs = "gcs",
    File = "file",
    Abfs = "abfs",
    Abfss = "abfss",
    Azure = "azure",
    Az = "az",
    Adl = "adl",
    Http = "http",
    Https = "https",
    Hf = "hf",
}
144
145impl fmt::Display for CloudScheme {
146    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
147        f.write_str(self.as_str())
148    }
149}
150
151crate::regex_cache::cached_regex! {
152    static CLOUD_SCHEME_REGEX = r"^(s3a?|gs|gcs|file|abfss?|azure|az|adl|https?|hf)$";
153}
154
155impl<'a> PlPathRef<'a> {
156    pub fn scheme(&self) -> Option<CloudScheme> {
157        match self {
158            Self::Local(_) => None,
159            Self::Cloud(p) => Some(p.scheme),
160        }
161    }
162
163    pub fn is_local(&self) -> bool {
164        matches!(self, Self::Local(_))
165    }
166
167    pub fn is_cloud_url(&self) -> bool {
168        matches!(self, Self::Cloud(_))
169    }
170
171    pub fn as_local_path(&self) -> Option<&Path> {
172        match self {
173            Self::Local(p) => Some(p),
174            Self::Cloud(_) => None,
175        }
176    }
177
178    pub fn as_cloud_addr(&self) -> Option<PlCloudPathRef<'_>> {
179        match self {
180            Self::Local(_) => None,
181            Self::Cloud(p) => Some(*p),
182        }
183    }
184
185    pub fn join(&self, other: impl AsRef<str>) -> PlPath {
186        let other = other.as_ref();
187        if other.is_empty() {
188            return self.into_owned();
189        }
190
191        match self {
192            Self::Local(p) => PlPath::Local(p.join(other).into()),
193            Self::Cloud(p) => {
194                let needs_slash = !p.uri.ends_with('/') && !other.starts_with('/');
195
196                let mut out =
197                    String::with_capacity(p.uri.len() + usize::from(needs_slash) + other.len());
198
199                out.push_str(p.uri);
200                if needs_slash {
201                    out.push('/');
202                }
203                // NOTE: This has as a consequence that pushing an absolute path into a URI
204                // just pushes the slashes while for a path it will make that absolute path the new
205                // path. I think this is acceptable as I don't really know what the alternative
206                // would be.
207                out.push_str(other);
208
209                let uri = out.into();
210                PlPath::Cloud(PlCloudPath {
211                    scheme: p.scheme,
212                    uri,
213                })
214            },
215        }
216    }
217
218    pub fn display(&self) -> AddressDisplay<'_> {
219        AddressDisplay { addr: *self }
220    }
221
222    pub fn from_local_path(path: &'a Path) -> Self {
223        Self::Local(path)
224    }
225
226    pub fn new(uri: &'a str) -> Self {
227        if let Some(i) = uri.find([':', '/']) {
228            if uri[i..].starts_with("://") && CLOUD_SCHEME_REGEX.is_match(&uri[..i]) {
229                let scheme = CloudScheme::from_str(&uri[..i]).unwrap();
230                return Self::Cloud(PlCloudPathRef { scheme, uri });
231            }
232        }
233
234        Self::from_local_path(Path::new(uri))
235    }
236
237    pub fn into_owned(self) -> PlPath {
238        match self {
239            Self::Local(p) => PlPath::Local(p.into()),
240            Self::Cloud(p) => PlPath::Cloud(p.into_owned()),
241        }
242    }
243
244    pub fn strip_scheme(&self) -> &str {
245        match self {
246            Self::Local(p) => p.to_str().unwrap(),
247            Self::Cloud(p) => p.strip_scheme(),
248        }
249    }
250
251    pub fn parent(&self) -> Option<Self> {
252        Some(match self {
253            Self::Local(p) => Self::Local(p.parent()?),
254            Self::Cloud(p) => {
255                let uri = p.uri;
256                let offset_start = p.scheme.as_str().len() + 3;
257                let last_slash = uri[offset_start..]
258                    .char_indices()
259                    .rev()
260                    .find(|(_, c)| *c == '/')?
261                    .0;
262                let uri = &uri[..offset_start + last_slash];
263
264                Self::Cloud(PlCloudPathRef {
265                    scheme: p.scheme,
266                    uri,
267                })
268            },
269        })
270    }
271
272    pub fn extension(&self) -> Option<&str> {
273        match self {
274            Self::Local(path) => path.extension().and_then(|e| e.to_str()),
275            Self::Cloud(_) => {
276                let offset_path = self.strip_scheme();
277                let separator = '/';
278
279                let mut ext_start = None;
280                for (i, c) in offset_path.char_indices() {
281                    if c == separator {
282                        ext_start = None;
283                    }
284
285                    if c == '.' && ext_start.is_none() {
286                        ext_start = Some(i);
287                    }
288                }
289
290                ext_start.map(|i| &offset_path[i + 1..])
291            },
292        }
293    }
294
295    pub fn to_str(&self) -> &'a str {
296        match self {
297            Self::Local(p) => p.to_str().unwrap(),
298            Self::Cloud(p) => p.uri,
299        }
300    }
301
302    // Panics: will panic if n is out of bounds, or the path cannot be parsed
303    pub fn offset_bytes(&'a self, n: usize) -> Self {
304        match self {
305            Self::Local(path) => {
306                let s = path.to_str().expect("Path is not valid UTF-8");
307                PlPathRef::Local(Path::new(&s[n..]))
308            },
309            Self::Cloud(cloudpath) => {
310                let s = self.to_str();
311                PlPathRef::Cloud(PlCloudPathRef {
312                    scheme: cloudpath.scheme,
313                    uri: &s[n..],
314                })
315            },
316        }
317    }
318
319    /// Return an iterator over the 'normal' components. This excludes any
320    /// prefix (e.g. 'C:\' or server share), directory (such as '.' and '..'),
321    /// or scheme, query or fragment data.
322    //
323    // For reference:
324    //   URI syntax = scheme ":" ["//" authority] path ["?" query] ["#" fragment]
325    //
326    // TODO: change to custom Enum Iterator if we care about performance
327    pub fn get_normal_components(&self) -> Box<dyn Iterator<Item = &str> + '_> {
328        match self {
329            Self::Local(path) => Box::new(path.components().filter_map(|c| match c {
330                Component::Normal(seg) => Some(seg.to_str().unwrap()),
331                _ => None,
332            })),
333            Self::Cloud(cloudpath) => {
334                let separator = '/';
335                let query_delimiter = '?';
336                let path = cloudpath.strip_scheme();
337                let path = path
338                    .split_once(query_delimiter)
339                    .map_or(path, |(before, _)| before);
340                Box::new(path.split(separator))
341            },
342        }
343    }
344}
345
346impl PlPath {
347    pub fn new(uri: &str) -> Self {
348        PlPathRef::new(uri).into_owned()
349    }
350
351    pub fn display(&self) -> AddressDisplay<'_> {
352        AddressDisplay {
353            addr: match self {
354                Self::Local(p) => PlPathRef::Local(p.as_ref()),
355                Self::Cloud(p) => PlPathRef::Cloud(p.as_ref()),
356            },
357        }
358    }
359
360    pub fn is_local(&self) -> bool {
361        self.as_ref().is_local()
362    }
363
364    pub fn is_cloud_url(&self) -> bool {
365        self.as_ref().is_cloud_url()
366    }
367
368    // We don't want FromStr since we are infallible.
369    #[expect(clippy::should_implement_trait)]
370    pub fn from_str(uri: &str) -> Self {
371        Self::new(uri)
372    }
373
374    pub fn from_string(uri: String) -> Self {
375        Self::new(&uri)
376    }
377
378    pub fn as_ref(&self) -> PlPathRef<'_> {
379        match self {
380            Self::Local(p) => PlPathRef::Local(p.as_ref()),
381            Self::Cloud(p) => PlPathRef::Cloud(p.as_ref()),
382        }
383    }
384
385    pub fn cloud_scheme(&self) -> Option<CloudScheme> {
386        match self {
387            Self::Local(_) => None,
388            Self::Cloud(p) => Some(p.scheme),
389        }
390    }
391
392    pub fn to_str(&self) -> &str {
393        match self {
394            Self::Local(p) => p.to_str().unwrap(),
395            Self::Cloud(p) => p.uri.as_ref(),
396        }
397    }
398
399    pub fn into_local_path(self) -> Option<Arc<Path>> {
400        match self {
401            PlPath::Local(path) => Some(path),
402            PlPath::Cloud(_) => None,
403        }
404    }
405}
406
#[cfg(test)]
mod tests {
    use super::*;

    /// Rewrites every `/` to the platform's main separator so that the local-path
    /// expectations hold on every OS.
    fn to_native(s: &str) -> String {
        s.chars()
            .map(|c| if c == '/' { std::path::MAIN_SEPARATOR } else { c })
            .collect()
    }

    #[test]
    fn plpath_join() {
        // (base, added, expected local result, expected URI result when it differs).
        // Every case runs twice: once as a local path and once behind a `file://` prefix.
        const CASES: &[(&str, &str, &str, Option<&str>)] = &[
            ("a/b/c/", "d/e", "a/b/c/d/e", None),
            ("a/b/c", "d/e", "a/b/c/d/e", None),
            ("a/b/c", "d/e/", "a/b/c/d/e/", None),
            ("a/b/c", "", "a/b/c", None),
            ("a/b/c", "/d", "/d", Some("a/b/c/d")),
            ("a/b/c", "/d/", "/d/", Some("a/b/c/d/")),
            ("", "/d/", "/d/", None),
            ("/", "/d/", "/d/", Some("//d/")),
            ("/x/y", "/d/", "/d/", Some("/x/y/d/")),
            ("/x/y", "/d", "/d", Some("/x/y/d")),
            ("/x/y", "d", "/x/y/d", None),
            ("/a/longer", "path", "/a/longer/path", None),
            ("/a/longer", "/path", "/path", Some("/a/longer/path")),
            ("/a/longer", "path/wow", "/a/longer/path/wow", None),
            ("/a/longer", "/path/wow", "/path/wow", Some("/a/longer/path/wow")),
            ("/an/even/longer", "path", "/an/even/longer/path", None),
            ("/an/even/longer", "/path", "/path", Some("/an/even/longer/path")),
            ("/an/even/longer", "path/wow", "/an/even/longer/path/wow", None),
            (
                "/an/even/longer",
                "/path/wow",
                "/path/wow",
                Some("/an/even/longer/path/wow"),
            ),
        ];

        for &(base, added, path_expected, uri_expected) in CASES {
            // Normal path test
            let path_base = to_native(base);
            let path_added = to_native(added);
            let path_result = to_native(path_expected);
            assert_eq!(
                PlPath::new(&path_base).as_ref().join(&path_added).to_str(),
                path_result,
                "local join of {base:?} + {added:?}"
            );

            // URI path test
            let uri_base = format!("file://{base}");
            let uri_result = format!("file://{}", uri_expected.unwrap_or(path_expected));
            assert_eq!(
                PlPath::new(uri_base.as_str())
                    .as_ref()
                    .join(added)
                    .to_str(),
                uri_result.as_str(),
                "URI join of {base:?} + {added:?}"
            );
        }
    }
}
470}