floem_editor_core/line_ending.rs

use std::{iter::Peekable, ops::Range};

use lapce_xi_rope::{DeltaBuilder, Rope, RopeDelta};
use memchr::{memchr, memchr2};
use std::sync::LazyLock;

// Cached ropes for the two line endings
static CR_LF: LazyLock<Rope> = LazyLock::new(|| Rope::from("\r\n"));
static LF: LazyLock<Rope> = LazyLock::new(|| Rope::from("\n"));

#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum LineEnding {
    /// `\r\n` Windows
    CrLf,
    /// `\n` Unix
    Lf,
}
impl LineEnding {
    /// Replace the line endings (`\n`, `\r\n`, `\r`) used in `text` with the line ending named by
    /// `self`.
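    ///
    /// Illustrative sketch (the same behavior is exercised by the `normalize` test below):
    ///
    /// ```ignore
    /// let text = Rope::from("a\r\nb\rc\nd");
    /// let normalized = LineEnding::Lf.normalize(&text);
    /// // `\r\n`, lone `\r`, and `\n` all become `\n`
    /// assert_eq!(normalized.slice_to_cow(..), "a\nb\nc\nd");
    /// ```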
    pub fn normalize(self, text: &Rope) -> Rope {
        self.normalize_delta(text)
            .map(|d| d.apply(text))
            .unwrap_or_else(|| text.clone())
    }

    pub fn normalize_delta(self, text: &Rope) -> Option<RopeDelta> {
        let mut builder = DeltaBuilder::new(text.len());

        let le = if self == LineEnding::Lf {
            LF.clone()
        } else {
            CR_LF.clone()
        };

        let mut had_entries = false;
        for (range, kind) in FullLeChunkSearch::new(text.iter_chunks(..)) {
            had_entries = true;
            match kind {
                LeChunkKind::CrLf => {
                    if self == LineEnding::Lf {
                        builder.replace(range, LF.clone());
                    }
                }
                LeChunkKind::Lf => {
                    if self == LineEnding::CrLf {
                        builder.replace(range, CR_LF.clone());
                    }
                }
                LeChunkKind::Cr => {
                    builder.replace(range, le.clone());
                }
            }
        }

        if had_entries {
            let delta = builder.build();
            Some(delta)
        } else {
            None
        }
    }

    /// Only replace lone carriage-return (`\r`) line endings; existing `\n` and `\r\n` are left
    /// untouched.
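    ///
    /// Illustrative sketch (the `CrLf` case is extrapolated from the code; the tests below only
    /// exercise `Lf`):
    ///
    /// ```ignore
    /// let text = Rope::from("a\rb\nc");
    /// let normalized = LineEnding::CrLf.normalize_limited(&text);
    /// // Only the lone `\r` is rewritten; the existing `\n` is left as-is
    /// assert_eq!(normalized.slice_to_cow(..), "a\r\nb\nc");
    /// ```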
    pub fn normalize_limited(self, text: &Rope) -> Rope {
        let mut builder = DeltaBuilder::new(text.len());

        let le = if self == LineEnding::Lf {
            LF.clone()
        } else {
            CR_LF.clone()
        };

        let mut had_entries = false;
        for offset in LoneCrChunkSearch::new(text.iter_chunks(..)) {
            had_entries = true;
            builder.replace(offset..offset + 1, le.clone());
        }

        if had_entries {
            let delta = builder.build();
            delta.apply(text)
        } else {
            text.clone()
        }
    }

    pub fn get_chars(&self) -> &'static str {
        match self {
            LineEnding::CrLf => "\r\n",
            LineEnding::Lf => "\n",
        }
    }

    /// Get the name of the line ending
    pub fn as_str(&self) -> &'static str {
        match self {
            LineEnding::CrLf => "CRLF",
            LineEnding::Lf => "LF",
        }
    }
}

#[derive(Debug, Clone, Copy)]
pub enum LineEndingDetermination {
    CrLf,
    Lf,
    Mixed,
    Unknown,
}
impl LineEndingDetermination {
    // TODO: should we just do a simpler routine of checking the first few lines?
    // Based on xi-rope's line-ending determination logic
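    //
    // Note: only the first line break in each chunk is inspected, so mixed endings within a
    // single chunk are classified by whichever break comes first, and a `\r\n` split across a
    // chunk boundary can be reported as `Mixed`.
    //
    // Illustrative results (assuming single-chunk ropes):
    //   "a\r\nb" -> CrLf,  "a\nb" -> Lf,  "a\rb" -> Mixed,  "ab" -> Unknown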
    pub fn determine(text: &Rope) -> Self {
        let mut crlf = false;
        let mut lf = false;

        for chunk in text.iter_chunks(..) {
            match LineEndingDetermination::determine_str(chunk) {
                LineEndingDetermination::CrLf => crlf = true,
                LineEndingDetermination::Lf => lf = true,
                LineEndingDetermination::Mixed => {
                    return LineEndingDetermination::Mixed;
                }
                LineEndingDetermination::Unknown => {}
            }
        }

        match (crlf, lf) {
            (true, true) => LineEndingDetermination::Mixed,
            (true, false) => LineEndingDetermination::CrLf,
            (false, true) => LineEndingDetermination::Lf,
            (false, false) => LineEndingDetermination::Unknown,
        }
    }

    fn determine_str(chunk: &str) -> LineEndingDetermination {
        let bytes = chunk.as_bytes();
        let newline = memchr2(b'\n', b'\r', bytes);
        match newline {
            Some(x) if bytes[x] == b'\r' && bytes.len() > x + 1 && bytes[x + 1] == b'\n' => {
                LineEndingDetermination::CrLf
            }
            Some(x) if bytes[x] == b'\n' => LineEndingDetermination::Lf,
            Some(_) => LineEndingDetermination::Mixed,
            None => LineEndingDetermination::Unknown,
        }
    }

    pub fn unwrap_or(self, le: LineEnding) -> LineEnding {
        match self {
            LineEndingDetermination::CrLf => LineEnding::CrLf,
            LineEndingDetermination::Lf => LineEnding::Lf,
            LineEndingDetermination::Mixed | LineEndingDetermination::Unknown => le,
        }
    }
}

#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
enum LeChunkKind {
    CrLf,
    Lf,
    Cr,
}

/// Iterator over the line endings (`\r\n`, `\n`, and lone `\r`) in a sequence of text chunks,
/// yielding the byte range of each ending together with its kind.
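///
/// Illustrative: fed `"a\r\nb\n"` as a single chunk, this yields `(1..3, CrLf)` followed by
/// `(4..5, Lf)` (see the `chunk_search` test below).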
struct FullLeChunkSearch<'a, I: Iterator<Item = &'a str>> {
    offset: usize,
    /// Offset within the chunk itself
    chunk_pos: usize,
    chunks: Peekable<I>,
}
impl<'a, I: Iterator<Item = &'a str>> FullLeChunkSearch<'a, I> {
    fn new(chunks: I) -> Self {
        Self {
            offset: 0,
            chunk_pos: 0,
            chunks: chunks.peekable(),
        }
    }

    /// Get the current chunk, advancing to the next chunk if the position is past the end of the
    /// current one
    fn get_chunk(&mut self) -> Option<&'a str> {
        let chunk = self.chunks.peek()?;
        if self.chunk_pos >= chunk.len() {
            self.advance_chunk();
            Some(*self.chunks.peek()?)
        } else {
            Some(chunk)
        }
    }

    fn advance_chunk(&mut self) -> Option<()> {
        let chunk = self.chunks.next()?;
        self.offset += chunk.len();
        self.chunk_pos = 0;

        Some(())
    }
}
impl<'a, I: Iterator<Item = &'a str>> Iterator for FullLeChunkSearch<'a, I> {
    type Item = (Range<usize>, LeChunkKind);

    fn next(&mut self) -> Option<Self::Item> {
        let chunk = self.get_chunk()?;

        let bytes = &chunk.as_bytes()[self.chunk_pos..];

        let newline = memchr2(b'\n', b'\r', bytes);
        match newline {
            // CrLf
            Some(x) if bytes[x] == b'\r' && bytes.len() > x + 1 && bytes[x + 1] == b'\n' => {
                let start = self.offset + self.chunk_pos + x;
                let end = start + 2;

                self.chunk_pos += x + 2;
                Some((start..end, LeChunkKind::CrLf))
            }
            // Lf
            Some(x) if bytes[x] == b'\n' => {
                let start = self.offset + self.chunk_pos + x;
                let end = start + 1;

                self.chunk_pos += x + 1;
                Some((start..end, LeChunkKind::Lf))
            }
            Some(x) => {
                // Typically this only occurs for a lone `\r`.
                // However, we need to handle the case where the `\r` is the last character in the
                // chunk whilst the next chunk starts with a `\n`.
                assert_eq!(bytes[x], b'\r');

                let start = self.offset + self.chunk_pos + x;
                self.chunk_pos += x + 1;

                let v = if self.chunk_pos == chunk.len() {
                    if let Some(next_chunk) = self.get_chunk() {
                        let next_chunk = &next_chunk.as_bytes()[self.chunk_pos..];
                        if next_chunk.starts_with(b"\n") {
                            self.chunk_pos += 1;
                            Some((start..start + 2, LeChunkKind::CrLf))
                        } else {
                            None
                        }
                    } else {
                        None
                    }
                } else {
                    None
                };

                Some(v.unwrap_or_else(|| {
                    // There is no `\n`, so it is a lone `\r`
                    // (the classic Mac OS line ending, or sometimes the result of bugged line endings)
                    let end = start + 1;
                    (start..end, LeChunkKind::Cr)
                }))
            }
            None => {
                self.advance_chunk();
                self.next()
            }
        }
    }
}

/// Iterator that searches for lone carriage returns (`\r`) in chunks of text.
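///
/// Illustrative: fed `"a\rb\r\nc"` as a single chunk, this yields only `1` (the offset of the
/// lone `\r`); the `\r\n` pair is skipped.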
struct LoneCrChunkSearch<'a, I: Iterator<Item = &'a str>> {
    /// Offset of the start of the current chunk
    offset: usize,
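    /// Offset within the current chunk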
    chunk_pos: usize,
    chunks: Peekable<I>,
}

impl<'a, I: Iterator<Item = &'a str>> LoneCrChunkSearch<'a, I> {
    fn new(chunks: I) -> Self {
        Self {
            offset: 0,
            chunk_pos: 0,
            chunks: chunks.peekable(),
        }
    }

    /// Get the current chunk, or if chunk pos is past the end of the chunk, then
    /// advance to the next chunk and get it.
    fn get_chunk(&mut self) -> Option<&'a str> {
        let chunk = self.chunks.peek()?;
        if self.chunk_pos >= chunk.len() {
            self.advance_chunk();
            Some(*self.chunks.peek()?)
        } else {
            Some(chunk)
        }
    }

    fn advance_chunk(&mut self) -> Option<()> {
        let chunk = self.chunks.next()?;
        self.offset += chunk.len();
        self.chunk_pos = 0;

        Some(())
    }
}

impl<'a, I: Iterator<Item = &'a str>> Iterator for LoneCrChunkSearch<'a, I> {
    type Item = usize;

    fn next(&mut self) -> Option<Self::Item> {
        loop {
            let chunk = self.get_chunk()?;

            let bytes = &chunk.as_bytes()[self.chunk_pos..];

            let newline = memchr(b'\r', bytes);
            match newline {
                Some(x) => {
                    let offset = self.offset + self.chunk_pos + x;

                    // Check if the next character is '\n' (indicating \r\n)
                    self.chunk_pos += x + 1;
                    if self.chunk_pos < chunk.len() && chunk.as_bytes()[self.chunk_pos] == b'\n' {
                        // Skip \r\n sequences
                        self.chunk_pos += 1;
                    } else if let Some(chunk_b) = self.get_chunk() {
                        let chunk_b = &chunk_b.as_bytes()[self.chunk_pos..];
                        if chunk_b.starts_with(b"\n") {
                            // Skip \r\n sequences across chunks
                            self.chunk_pos += 1;
                        } else {
                            // Lone \r
                            return Some(offset);
                        }
                    } else {
                        // Lone \r at the end
                        return Some(offset);
                    }
                }
                None => {
                    self.advance_chunk();
                }
            }
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn normalize() {
        let text = Rope::from("hello\r\nworld toast and jam\nthe end\nhi");
        let normalized = LineEnding::CrLf.normalize(&text);
        assert_eq!(
            normalized.slice_to_cow(..),
            "hello\r\nworld toast and jam\r\nthe end\r\nhi"
        );

        let text = Rope::from("\n");
        let normalized = LineEnding::Lf.normalize(&text);
        assert_eq!(normalized.slice_to_cow(..), "\n");
        let normalized = LineEnding::CrLf.normalize(&text);
        assert_eq!(normalized.slice_to_cow(..), "\r\n");

        let text = Rope::from("\r\n");
        let normalized = LineEnding::Lf.normalize(&text);
        assert_eq!(normalized.slice_to_cow(..), "\n");
        let normalized = LineEnding::CrLf.normalize(&text);
        assert_eq!(normalized.slice_to_cow(..), "\r\n");

        // `\r` is always normalized to the line ending of the file
        let text = Rope::from("\r");
        let normalized = LineEnding::Lf.normalize(&text);
        assert_eq!(normalized.slice_to_cow(..), "\n");
        let normalized = LineEnding::CrLf.normalize(&text);
        assert_eq!(normalized.slice_to_cow(..), "\r\n");
        let normalized = LineEnding::Lf.normalize_limited(&text);
        assert_eq!(normalized.slice_to_cow(..), "\n");

        let text = Rope::from("\rtest");
        let normalized = LineEnding::Lf.normalize(&text);
        assert_eq!(normalized.slice_to_cow(..), "\ntest");
        let normalized = LineEnding::CrLf.normalize(&text);
        assert_eq!(normalized.slice_to_cow(..), "\r\ntest");
        let normalized = LineEnding::Lf.normalize_limited(&text);
        assert_eq!(normalized.slice_to_cow(..), "\ntest");
    }

    #[test]
    fn chunk_search() {
        let text = Rope::from("hello\r\nworld toast and jam\nthe end\nhi");
        let c = FullLeChunkSearch::new(text.iter_chunks(..));
        assert_eq!(
            c.collect::<Vec<_>>(),
            vec![
                (5..7, LeChunkKind::CrLf),
                (26..27, LeChunkKind::Lf),
                (34..35, LeChunkKind::Lf),
            ]
        );
        let c = LoneCrChunkSearch::new(text.iter_chunks(..));
        assert_eq!(c.collect::<Vec<_>>(), Vec::new());

        // Test searching across different chunks of text
        // (Using a non-Rope iterator to simplify creation; it should behave the same)
        let text = ["a\n", "\n5", "\r\ne\r", "\ntest\r", "\rv"];
        let multi_chunk = FullLeChunkSearch::new(text.into_iter());
        assert_eq!(
            multi_chunk.collect::<Vec<_>>(),
            vec![
                (1..2, LeChunkKind::Lf),
                (2..3, LeChunkKind::Lf),
                (4..6, LeChunkKind::CrLf),
                (7..9, LeChunkKind::CrLf),
                (13..14, LeChunkKind::Cr),
                (14..15, LeChunkKind::Cr),
            ]
        );

        let multi_chunk = LoneCrChunkSearch::new(text.into_iter());
        assert_eq!(multi_chunk.collect::<Vec<_>>(), vec![13, 14]);

        let text = ["\n\rb"];
        let chunks = FullLeChunkSearch::new(text.into_iter());
        assert_eq!(
            chunks.collect::<Vec<_>>(),
            vec![(0..1, LeChunkKind::Lf), (1..2, LeChunkKind::Cr)]
        );

        let text = ["\n\rb"];
        let chunks = LoneCrChunkSearch::new(text.into_iter());
        assert_eq!(chunks.collect::<Vec<_>>(), vec![1]);
    }
}