From 027a42873975011393a893e4ce5c1d8a13456ac9 Mon Sep 17 00:00:00 2001 From: David Judd Date: Thu, 27 Dec 2018 09:31:38 -0800 Subject: [PATCH 1/2] More benchmarks --- benches/bench.rs | 36 +++++++++ benches/long.txt | 204 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 240 insertions(+) create mode 100644 benches/long.txt diff --git a/benches/bench.rs b/benches/bench.rs index 153012b..b06886b 100644 --- a/benches/bench.rs +++ b/benches/bench.rs @@ -3,6 +3,7 @@ extern crate unicode_normalization; extern crate test; +use std::fs; use test::Bencher; use unicode_normalization::UnicodeNormalization; @@ -80,6 +81,41 @@ fn bench_nfd_ascii(b: &mut Bencher) { b.iter(|| ASCII.nfd().count()); } +#[bench] +fn bench_nfc_long(b: &mut Bencher) { + let long = fs::read_to_string("benches/long.txt").unwrap(); + b.iter(|| long.nfc().count()); +} + +#[bench] +fn bench_nfd_long(b: &mut Bencher) { + let long = fs::read_to_string("benches/long.txt").unwrap(); + b.iter(|| long.nfd().count()); +} + +#[bench] +fn bench_nfkc_ascii(b: &mut Bencher) { + b.iter(|| ASCII.nfkc().count()); +} + +#[bench] +fn bench_nfkd_ascii(b: &mut Bencher) { + b.iter(|| ASCII.nfkd().count()); +} + +#[bench] +fn bench_nfkc_long(b: &mut Bencher) { + let long = fs::read_to_string("benches/long.txt").unwrap(); + b.iter(|| long.nfkc().count()); +} + + +#[bench] +fn bench_nfkd_long(b: &mut Bencher) { + let long = fs::read_to_string("benches/long.txt").unwrap(); + b.iter(|| long.nfkd().count()); +} + #[bench] fn bench_streamsafe_ascii(b: &mut Bencher) { b.iter(|| ASCII.stream_safe().count()); diff --git a/benches/long.txt b/benches/long.txt new file mode 100644 index 0000000..2b68a5c --- /dev/null +++ b/benches/long.txt @@ -0,0 +1,204 @@ +Original by Markus Kuhn, adapted for HTML by Martin Dürst. 
+ +UTF-8 encoded sample plain-text file +‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾ + +Markus Kuhn [ˈmaʳkʊs kuːn] — 1999-08-20 + + +The ASCII compatible UTF-8 encoding of ISO 10646 and Unicode +plain-text files is defined in RFC 2279 and in ISO 10646-1 Annex R. + + +Using Unicode/UTF-8, you can write in emails and source code things such as + +Mathematics and Sciences: + + ∮ E⋅da = Q, n → ∞, ∑ f(i) = ∏ g(i), ∀x∈ℝ: ⌈x⌉ = −⌊−x⌋, α ∧ ¬β = ¬(¬α ∨ β), + + ℕ ⊆ ℕ₀ ⊂ ℤ ⊂ ℚ ⊂ ℝ ⊂ ℂ, ⊥ < a ≠ b ≡ c ≤ d ≪ ⊤ ⇒ (A ⇔ B), + + 2H₂ + O₂ ⇌ 2H₂O, R = 4.7 kΩ, ⌀ 200 mm + +Linguistics and dictionaries: + + ði ıntəˈnæʃənəl fəˈnɛtık əsoʊsiˈeıʃn + Y [ˈʏpsilɔn], Yen [jɛn], Yoga [ˈjoːgɑ] + +APL: + + ((V⍳V)=⍳⍴V)/V←,V ⌷←⍳→⍴∆∇⊃‾⍎⍕⌈ + +Nicer typography in plain text files: + + ╔══════════════════════════════════════════╗ + ║ ║ + ║ • ‘single’ and “double” quotes ║ + ║ ║ + ║ • Curly apostrophes: “We’ve been here” ║ + ║ ║ + ║ • Latin-1 apostrophe and accents: '´` ║ + ║ ║ + ║ • ‚deutsche‘ „Anführungszeichen“ ║ + ║ ║ + ║ • †, ‡, ‰, •, 3–4, —, −5/+5, ™, … ║ + ║ ║ + ║ • ASCII safety test: 1lI|, 0OD, 8B ║ + ║ ╭─────────╮ ║ + ║ • the euro symbol: │ 14.95 € │ ║ + ║ ╰─────────╯ ║ + ╚══════════════════════════════════════════╝ + +Greek (in Polytonic): + + The Greek anthem: + + Σὲ γνωρίζω ἀπὸ τὴν κόψη + τοῦ σπαθιοῦ τὴν τρομερή, + σὲ γνωρίζω ἀπὸ τὴν ὄψη + ποὺ μὲ βία μετράει τὴ γῆ. + + ᾿Απ᾿ τὰ κόκκαλα βγαλμένη + τῶν ῾Ελλήνων τὰ ἱερά + καὶ σὰν πρῶτα ἀνδρειωμένη + χαῖρε, ὦ χαῖρε, ᾿Ελευθεριά! + + From a speech of Demosthenes in the 4th century BC: + + Οὐχὶ ταὐτὰ παρίσταταί μοι γιγνώσκειν, ὦ ἄνδρες ᾿Αθηναῖοι, + ὅταν τ᾿ εἰς τὰ πράγματα ἀποβλέψω καὶ ὅταν πρὸς τοὺς + λόγους οὓς ἀκούω· τοὺς μὲν γὰρ λόγους περὶ τοῦ + τιμωρήσασθαι Φίλιππον ὁρῶ γιγνομένους, τὰ δὲ πράγματ᾿ + εἰς τοῦτο προήκοντα, ὥσθ᾿ ὅπως μὴ πεισόμεθ᾿ αὐτοὶ + πρότερον κακῶς σκέψασθαι δέον. οὐδέν οὖν ἄλλο μοι δοκοῦσιν + οἱ τὰ τοιαῦτα λέγοντες ἢ τὴν ὑπόθεσιν, περὶ ἧς βουλεύεσθαι, + οὐχὶ τὴν οὖσαν παριστάντες ὑμῖν ἁμαρτάνειν. 
ἐγὼ δέ, ὅτι μέν + ποτ᾿ ἐξῆν τῇ πόλει καὶ τὰ αὑτῆς ἔχειν ἀσφαλῶς καὶ Φίλιππον + τιμωρήσασθαι, καὶ μάλ᾿ ἀκριβῶς οἶδα· ἐπ᾿ ἐμοῦ γάρ, οὐ πάλαι + γέγονεν ταῦτ᾿ ἀμφότερα· νῦν μέντοι πέπεισμαι τοῦθ᾿ ἱκανὸν + προλαβεῖν ἡμῖν εἶναι τὴν πρώτην, ὅπως τοὺς συμμάχους + σώσομεν. ἐὰν γὰρ τοῦτο βεβαίως ὑπάρξῃ, τότε καὶ περὶ τοῦ + τίνα τιμωρήσεταί τις καὶ ὃν τρόπον ἐξέσται σκοπεῖν· πρὶν δὲ + τὴν ἀρχὴν ὀρθῶς ὑποθέσθαι, μάταιον ἡγοῦμαι περὶ τῆς + τελευτῆς ὁντινοῦν ποιεῖσθαι λόγον. + + Δημοσθένους, Γ´ ᾿Ολυνθιακὸς + +Georgian: + + From a Unicode conference invitation: + + გთხოვთ ახლავე გაიაროთ რეგისტრაცია Unicode-ის მეათე საერთაშორისო + კონფერენციაზე დასასწრებად, რომელიც გაიმართება 10-12 მარტს, + ქ. მაინცში, გერმანიაში. კონფერენცია შეჰკრებს ერთად მსოფლიოს + ექსპერტებს ისეთ დარგებში როგორიცაა ინტერნეტი და Unicode-ი, + ინტერნაციონალიზაცია და ლოკალიზაცია, Unicode-ის გამოყენება + ოპერაციულ სისტემებსა, და გამოყენებით პროგრამებში, შრიფტებში, + ტექსტების დამუშავებასა და მრავალენოვან კომპიუტერულ სისტემებში. + +Russian: + + From a Unicode conference invitation: + + Зарегистрируйтесь сейчас на Десятую Международную Конференцию по + Unicode, которая состоится 10-12 марта 1997 года в Майнце в Германии. + Конференция соберет широкий круг экспертов по вопросам глобального + Интернета и Unicode, локализации и интернационализации, воплощению и + применению Unicode в различных операционных системах и программных + приложениях, шрифтах, верстке и многоязычных компьютерных системах. 
+ +Thai (UCS Level 2): + + Excerpt from a poetry on The Romance of The Three Kingdoms (a Chinese + classic 'San Gua'): + + [----------------------------|------------------------] + ๏ แผ่นดินฮั่นเสื่อมโทรมแสนสังเวช พระปกเกศกองบู๊กู้ขึ้นใหม่ + สิบสองกษัตริย์ก่อนหน้าแลถัดไป สององค์ไซร้โง่เขลาเบาปัญญา + ทรงนับถือขันทีเป็นที่พึ่ง บ้านเมืองจึงวิปริตเป็นนักหนา + โฮจิ๋นเรียกทัพทั่วหัวเมืองมา หมายจะฆ่ามดชั่วตัวสำคัญ + เหมือนขับไสไล่เสือจากเคหา รับหมาป่าเข้ามาเลยอาสัญ + ฝ่ายอ้องอุ้นยุแยกให้แตกกัน ใช้สาวนั้นเป็นชนวนชื่นชวนใจ + พลันลิฉุยกุยกีกลับก่อเหตุ ช่างอาเพศจริงหนาฟ้าร้องไห้ + ต้องรบราฆ่าฟันจนบรรลัย ฤๅหาใครค้ำชูกู้บรรลังก์ ฯ + + (The above is a two-column text. If combining characters are handled + correctly, the lines of the second column should be aligned with the + | character above.) + +Ethiopian: + + Proverbs in the Amharic language: + + ሰማይ አይታረስ ንጉሥ አይከሰስ። + ብላ ካለኝ እንደአባቴ በቆመጠኝ። + ጌጥ ያለቤቱ ቁምጥና ነው። + ደሀ በሕልሙ ቅቤ ባይጠጣ ንጣት በገደለው። + የአፍ ወለምታ በቅቤ አይታሽም። + አይጥ በበላ ዳዋ ተመታ። + ሲተረጉሙ ይደረግሙ። + ቀስ በቀስ፥ ዕንቁላል በእግሩ ይሄዳል። + ድር ቢያብር አንበሳ ያስር። + ሰው እንደቤቱ እንጅ እንደ ጉረቤቱ አይተዳደርም። + እግዜር የከፈተውን ጉሮሮ ሳይዘጋው አይድርም። + የጎረቤት ሌባ፥ ቢያዩት ይስቅ ባያዩት ያጠልቅ። + ሥራ ከመፍታት ልጄን ላፋታት። + ዓባይ ማደሪያ የለው፥ ግንድ ይዞ ይዞራል። + የእስላም አገሩ መካ የአሞራ አገሩ ዋርካ። + ተንጋሎ ቢተፉ ተመልሶ ባፉ። + ወዳጅህ ማር ቢሆን ጨርስህ አትላሰው። + እግርህን በፍራሽህ ልክ ዘርጋ። + +Runes: + + ᚻᛖ ᚳᚹᚫᚦ ᚦᚫᛏ ᚻᛖ ᛒᚢᛞᛖ ᚩᚾ ᚦᚫᛗ ᛚᚪᚾᛞᛖ ᚾᚩᚱᚦᚹᛖᚪᚱᛞᚢᛗ ᚹᛁᚦ ᚦᚪ ᚹᛖᛥᚫ + + (Old English, which transcribed into Latin reads 'He cwaeth that he + bude thaem lande northweardum with tha Westsae.' 
and means 'He said + that he lived in the northern land near the Western Sea.') + +Braille: + + ⡌⠁⠧⠑ ⠼⠁⠒ ⡍⠜⠇⠑⠹⠰⠎ ⡣⠕⠌ + + ⡍⠜⠇⠑⠹ ⠺⠁⠎ ⠙⠑⠁⠙⠒ ⠞⠕ ⠃⠑⠛⠔ ⠺⠊⠹⠲ ⡹⠻⠑ ⠊⠎ ⠝⠕ ⠙⠳⠃⠞ + ⠱⠁⠞⠑⠧⠻ ⠁⠃⠳⠞ ⠹⠁⠞⠲ ⡹⠑ ⠗⠑⠛⠊⠌⠻ ⠕⠋ ⠙⠊⠎ ⠃⠥⠗⠊⠁⠇ ⠺⠁⠎ + ⠎⠊⠛⠝⠫ ⠃⠹ ⠹⠑ ⠊⠇⠻⠛⠹⠍⠁⠝⠂ ⠹⠑ ⠊⠇⠻⠅⠂ ⠹⠑ ⠥⠝⠙⠻⠞⠁⠅⠻⠂ + ⠁⠝⠙ ⠹⠑ ⠡⠊⠑⠋ ⠍⠳⠗⠝⠻⠲ ⡎⠊⠗⠕⠕⠛⠑ ⠎⠊⠛⠝⠫ ⠊⠞⠲ ⡁⠝⠙ + ⡎⠊⠗⠕⠕⠛⠑⠰⠎ ⠝⠁⠍⠑ ⠺⠁⠎ ⠛⠕⠕⠙ ⠥⠏⠕⠝ ⠰⡡⠁⠝⠛⠑⠂ ⠋⠕⠗ ⠁⠝⠹⠹⠔⠛ ⠙⠑ + ⠡⠕⠎⠑ ⠞⠕ ⠏⠥⠞ ⠙⠊⠎ ⠙⠁⠝⠙ ⠞⠕⠲ + + ⡕⠇⠙ ⡍⠜⠇⠑⠹ ⠺⠁⠎ ⠁⠎ ⠙⠑⠁⠙ ⠁⠎ ⠁ ⠙⠕⠕⠗⠤⠝⠁⠊⠇⠲ + + ⡍⠔⠙⠖ ⡊ ⠙⠕⠝⠰⠞ ⠍⠑⠁⠝ ⠞⠕ ⠎⠁⠹ ⠹⠁⠞ ⡊ ⠅⠝⠪⠂ ⠕⠋ ⠍⠹ + ⠪⠝ ⠅⠝⠪⠇⠫⠛⠑⠂ ⠱⠁⠞ ⠹⠻⠑ ⠊⠎ ⠏⠜⠞⠊⠊⠥⠇⠜⠇⠹ ⠙⠑⠁⠙ ⠁⠃⠳⠞ + ⠁ ⠙⠕⠕⠗⠤⠝⠁⠊⠇⠲ ⡊ ⠍⠊⠣⠞ ⠙⠁⠧⠑ ⠃⠑⠲ ⠔⠊⠇⠔⠫⠂ ⠍⠹⠎⠑⠇⠋⠂ ⠞⠕ + ⠗⠑⠛⠜⠙ ⠁ ⠊⠕⠋⠋⠔⠤⠝⠁⠊⠇ ⠁⠎ ⠹⠑ ⠙⠑⠁⠙⠑⠌ ⠏⠊⠑⠊⠑ ⠕⠋ ⠊⠗⠕⠝⠍⠕⠝⠛⠻⠹ + ⠔ ⠹⠑ ⠞⠗⠁⠙⠑⠲ ⡃⠥⠞ ⠹⠑ ⠺⠊⠎⠙⠕⠍ ⠕⠋ ⠳⠗ ⠁⠝⠊⠑⠌⠕⠗⠎ + ⠊⠎ ⠔ ⠹⠑ ⠎⠊⠍⠊⠇⠑⠆ ⠁⠝⠙ ⠍⠹ ⠥⠝⠙⠁⠇⠇⠪⠫ ⠙⠁⠝⠙⠎ + ⠩⠁⠇⠇ ⠝⠕⠞ ⠙⠊⠌⠥⠗⠃ ⠊⠞⠂ ⠕⠗ ⠹⠑ ⡊⠳⠝⠞⠗⠹⠰⠎ ⠙⠕⠝⠑ ⠋⠕⠗⠲ ⡹⠳ + ⠺⠊⠇⠇ ⠹⠻⠑⠋⠕⠗⠑ ⠏⠻⠍⠊⠞ ⠍⠑ ⠞⠕ ⠗⠑⠏⠑⠁⠞⠂ ⠑⠍⠏⠙⠁⠞⠊⠊⠁⠇⠇⠹⠂ ⠹⠁⠞ + ⡍⠜⠇⠑⠹ ⠺⠁⠎ ⠁⠎ ⠙⠑⠁⠙ ⠁⠎ ⠁ ⠙⠕⠕⠗⠤⠝⠁⠊⠇⠲ + + (The first couple of paragraphs of "A Christmas Carol" by Dickens) + +Compact font selection example text: + + ABCDEFGHIJKLMNOPQRSTUVWXYZ /0123456789 + abcdefghijklmnopqrstuvwxyz £©µÀÆÖÞßéöÿ + –—‘“”„†•…‰™œŠŸž€ ΑΒΓΔΩαβγδω АБВГДабвгд + ∀∂∈ℝ∧∪≡∞ ↑↗↨↻⇣ ┐┼╔╘░►☺♀ fi�⑀₂ἠḂӥẄɐː⍎אԱა + +Greetings in various languages: + + Hello world, Καλημέρα κόσμε, コンニチハ + +Box drawing alignment tests: █ + ▉ + ╔══╦══╗ ┌──┬──┐ ╭──┬──╮ ╭──┬──╮ ┏━━┳━━┓ ┎┒┏┑ ╷ ╻ ┏┯┓ ┌┰┐ ▊ ╱╲╱╲╳╳╳ + ║┌─╨─┐║ │╔═╧═╗│ │╒═╪═╕│ │╓─╁─╖│ ┃┌─╂─┐┃ ┗╃╄┙ ╶┼╴╺╋╸┠┼┨ ┝╋┥ ▋ ╲╱╲╱╳╳╳ + ║│╲ ╱│║ │║ ║│ ││ │ ││ │║ ┃ ║│ ┃│ ╿ │┃ ┍╅╆┓ ╵ ╹ ┗┷┛ └┸┘ ▌ ╱╲╱╲╳╳╳ + ╠╡ ╳ ╞╣ ├╢ ╟┤ ├┼─┼─┼┤ ├╫─╂─╫┤ ┣┿╾┼╼┿┫ ┕┛┖┚ ┌┄┄┐ ╎ ┏┅┅┓ ┋ ▍ ╲╱╲╱╳╳╳ + ║│╱ ╲│║ │║ ║│ ││ │ ││ │║ ┃ ║│ ┃│ ╽ │┃ ░░▒▒▓▓██ ┊ ┆ ╎ ╏ ┇ ┋ ▎ + ║└─╥─┘║ │╚═╤═╝│ │╘═╪═╛│ │╙─╀─╜│ ┃└─╂─┘┃ ░░▒▒▓▓██ ┊ ┆ ╎ ╏ ┇ ┋ ▏ + ╚══╩══╝ └──┴──┘ ╰──┴──╯ ╰──┴──╯ ┗━━┻━━┛ └╌╌┘ ╎ ┗╍╍┛ ┋ ▁▂▃▄▅▆▇█ From 6eb6734b1cc51d654a3bcd49aaeb3199d9b471fc Mon Sep 17 00:00:00 2001 From: David Judd Date: Sat, 29 Dec 2018 13:56:01 -0800 Subject: [PATCH 2/2] Replace Vec/VecDeque with normally non-allocating buffer --- benches/bench.rs | 1 - src/buffer.rs | 267 +++++++++++++++++++++++++++++++++++++++++++++++ 
// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

// There's no particular science behind this number, but it seems to work
// relatively well in ad-hoc benchmarks on a 2012 Macbook Pro.
const INLINE_MAX: usize = 4;

// A struct optimized for FIFO queuing of very small amounts of data, supporting
// `push_back` and `pop_front` operations as well as a `mutate_in_place`
// operation used in practice for sorting.
//
// Stores the first INLINE_MAX entries inline, and then spills to a Vec;
// will only provide any advantage if spilling is unlikely.
//
// The implementation is a sort of compromise between a ring buffer and a plain
// array. We increment a `front` pointer on `pop_front` rather than move
// everything, but we don't use a true ring buffer because that would make
// sorting pretty complex. Instead we just compact `front` back to zero
// periodically. (Since sorting is only used for decomposition, not
// recomposition, this means we're more optimized for the former currently.)
//
// Invariants maintained by every method:
//
// * `front < INLINE_MAX` and `front <= back`.
// * Not spilled (`back <= INLINE_MAX`): the items are `data[front..back]` and
//   `spill` is empty.
// * Spilled (`back > INLINE_MAX`): the items are `spill[front..back]`,
//   `spill.len() == back`, and `data[i] == spill[i]` for every `i` in
//   `front..INLINE_MAX`. `pop_front` relies on that index-for-index mirror
//   because it always reads from `data[front]`.
#[derive(Clone)]
pub struct Buffer<T: Copy + Default> {
    // Index where the next item will be inserted, either in `data` or in
    // `spill`.
    back: usize,

    // Index of the first item in the buffer (zero if the buffer is empty).
    // The first item is always readable from `data`.
    front: usize,

    // Inline storage, used for the fast path.
    data: [T; INLINE_MAX],

    // Logical extension of the inline storage; see the invariants above.
    // Values before `front` are garbage in both `spill` and `data`.
    spill: Vec<T>,
}

impl<T: Copy + Default> Buffer<T> {
    /// Creates an empty buffer. Does not allocate.
    #[inline]
    pub fn new() -> Buffer<T> {
        Buffer {
            data: [T::default(); INLINE_MAX],
            spill: Vec::new(),
            front: 0,
            back: 0,
        }
    }

    /// Calls `f` exactly once with a single mutable slice holding every
    /// queued item in FIFO order (index 0 is the next item to be popped).
    ///
    /// `f` may reorder or overwrite the items; it cannot change how many
    /// there are.
    #[inline]
    pub fn mutate_in_place<F>(&mut self, mut f: F)
    where
        F: FnMut(&mut [T]),
    {
        if self.back <= INLINE_MAX {
            // Not spilled (fast path)
            f(&mut self.data[self.front..self.back]);
        } else if self.size() <= INLINE_MAX {
            // Spilled, can unspill (might as well since it's a copy either way)
            self.unspill();
            f(&mut self.data[0..self.back]);
        } else {
            // Spilled, have to stay that way. `f` only saw `spill`, so the
            // inline mirror has to be brought back in sync afterwards.
            f(&mut self.spill[self.front..self.back]);
            self.refresh_inline_copy();
        }
    }

    /// Appends `item` to the back of the queue, spilling to the heap only
    /// when more than `INLINE_MAX` items are queued at once.
    pub fn push_back(&mut self, item: T) {
        if self.back < INLINE_MAX {
            // Fast path
            self.data[self.back] = item;
        } else if self.size() < INLINE_MAX && self.back == INLINE_MAX {
            // Non-allocating fallback: slide the live items down to index 0
            // to make room at the end of the inline array.
            self.shrink_inline();
            self.data[self.back] = item;
        } else if self.back == INLINE_MAX {
            // Allocating fallback
            self.spill(item);
        } else {
            // Already spilled
            self.spill.push(item);
        }

        self.back += 1;
    }

    /// Equivalent to `mutate_in_place(f)` followed by `push_back(item)`:
    /// `f` sees only the items queued *before* `item`. Fused so the common
    /// cases touch the storage once.
    #[inline]
    pub fn mutate_and_push_back<F>(&mut self, item: T, mut f: F)
    where
        F: FnMut(&mut [T]),
    {
        if self.back < INLINE_MAX {
            // Fast path
            f(&mut self.data[self.front..self.back]);
            self.data[self.back] = item;
        } else if self.size() < INLINE_MAX && self.back == INLINE_MAX {
            // Non-allocating fallback
            self.shrink_inline();
            f(&mut self.data[0..self.back]);
            self.data[self.back] = item;
        } else if self.size() < INLINE_MAX {
            // Spilled, can unspill (might as well since it's a copy either way)
            self.unspill();
            f(&mut self.data[0..self.back]);
            self.data[self.back] = item;
        } else if self.back == INLINE_MAX {
            // Allocating fallback. The inline array is exactly full here
            // (`front == 0`), so the whole array is the queue.
            f(&mut self.data);
            self.spill(item);
        } else {
            // Already spilled; re-sync the inline mirror after `f` ran on
            // `spill`, then append.
            f(&mut self.spill[self.front..self.back]);
            self.refresh_inline_copy();
            self.spill.push(item);
        }

        self.back += 1;
    }

    /// Removes and returns the oldest item, or `None` if the buffer is empty.
    #[inline]
    pub fn pop_front(&mut self) -> Option<T> {
        match self.size() {
            0 => None,
            1 => {
                // Fast path when the buffer has exactly one item. Since
                // `front < INLINE_MAX`, a size of one implies we are not
                // spilled, so the buffer can simply be reset to its pristine
                // empty state; that keeps `front` at zero while empty and
                // spares the next pushes a compaction.
                let result = self.data[self.front];
                self.front = 0;
                self.back = 0;
                Some(result)
            }
            _ => {
                // Potentially slower path, if we need to compact. Compacting
                // *before* the read keeps `front` strictly below INLINE_MAX.
                if self.front == INLINE_MAX - 1 {
                    self.compact();
                }
                let result = self.data[self.front];
                self.front += 1;
                Some(result)
            }
        }
    }

    // Moves from the exactly-full inline state to the spilled state while
    // appending `item`. The caller is responsible for bumping `back`.
    #[inline]
    fn spill(&mut self, item: T) {
        debug_assert!(self.back == INLINE_MAX && self.size() == INLINE_MAX);
        self.spill.clear();
        self.spill.reserve(INLINE_MAX + 1);
        self.spill.extend_from_slice(&self.data);
        self.spill.push(item);
    }

    // Resets `front` to zero using whichever strategy fits the current state.
    #[inline]
    fn compact(&mut self) {
        debug_assert!(self.front > 0 && self.back >= INLINE_MAX);
        if self.back == INLINE_MAX {
            self.shrink_inline();
        } else if self.size() <= INLINE_MAX {
            self.unspill();
        } else {
            self.shrink_spilled();
        }
    }

    // Not spilled: slides the live inline items down so they start at index 0.
    #[inline]
    fn shrink_inline(&mut self) {
        debug_assert!(self.back <= INLINE_MAX && self.size() <= INLINE_MAX);
        for i in self.front..self.back {
            self.data[i - self.front] = self.data[i];
        }
        self.back -= self.front;
        self.front = 0;
    }

    // Spilled, but everything fits inline again: copy the live items back
    // into `data` and drop the heap copy (the allocation itself is kept).
    #[inline]
    fn unspill(&mut self) {
        debug_assert!(self.back > INLINE_MAX && self.size() <= INLINE_MAX);
        let size = self.size();
        self.data[0..size].copy_from_slice(&self.spill[self.front..self.back]);
        self.spill.clear();
        self.back = size;
        self.front = 0;
    }

    // Spilled and staying that way: drop the consumed prefix of `spill` and
    // re-mirror its first INLINE_MAX live items into `data`.
    #[inline]
    fn shrink_spilled(&mut self) {
        debug_assert!(self.back > INLINE_MAX && self.size() > INLINE_MAX);
        self.data
            .copy_from_slice(&self.spill[self.front..self.front + INLINE_MAX]);
        self.spill.drain(0..self.front);
        self.back -= self.front;
        self.front = 0;
    }

    // Spilled: re-establishes `data[i] == spill[i]` for `i` in
    // `front..INLINE_MAX` after `spill` was mutated. `front` is NOT reset
    // here, so the copy must be index-for-index; copying
    // `spill[front..front + INLINE_MAX]` to `data[0..]` would shift every
    // item by `front` slots and make `pop_front` return the wrong one.
    #[inline]
    fn refresh_inline_copy(&mut self) {
        debug_assert!(self.back > INLINE_MAX && self.front < INLINE_MAX);
        let front = self.front;
        self.data[front..].copy_from_slice(&self.spill[front..INLINE_MAX]);
    }

    // Number of items currently queued.
    #[inline]
    fn size(&self) -> usize {
        self.back - self.front
    }
}

#[cfg(test)]
mod tests {
    use super::{Buffer, INLINE_MAX};

    #[test]
    fn test_buffer_is_fifo() {
        let mut buffer: Buffer<usize> = Buffer::new();
        assert_eq!(None, buffer.pop_front());
        for i in 0..INLINE_MAX * 3 {
            buffer.push_back(i);
        }
        for i in 0..INLINE_MAX * 3 {
            assert_eq!(Some(i), buffer.pop_front());
        }
        assert_eq!(None, buffer.pop_front());
    }

    #[test]
    fn test_buffer_sort_in_place() {
        let mut buffer: Buffer<usize> = Buffer::new();
        for i in 0..INLINE_MAX * 3 {
            for j in 0..i {
                buffer.push_back(i - j - 1);
            }
            buffer.mutate_in_place(|data| data.sort());
            for j in 0..i {
                assert_eq!(Some(j), buffer.pop_front());
            }
        }
    }

    #[test]
    fn test_buffer_sort_before_push() {
        let mut buffer: Buffer<usize> = Buffer::new();
        for i in 0..INLINE_MAX * 3 {
            let inputs = (0..i).map(|j| i - j - 1).collect::<Vec<usize>>();

            for j in &*inputs {
                buffer.mutate_and_push_back(*j, |data| data.sort());
            }
            let results = (0..i)
                .map(|_| buffer.pop_front().unwrap())
                .collect::<Vec<usize>>();

            let mut expected = inputs.clone();
            if i > 1 {
                expected[0..i - 1].sort();
            }
            assert_eq!(expected, results);
        }
    }

    #[test]
    fn test_buffer_mutate_while_spilled_and_partially_drained() {
        // Mutating while spilled with `front > 0` must not disturb FIFO order.
        for popped in 1..INLINE_MAX {
            let total = INLINE_MAX * 2 + popped;
            let mut buffer: Buffer<usize> = Buffer::new();
            for i in 0..total {
                buffer.push_back(i);
            }
            for i in 0..popped {
                assert_eq!(Some(i), buffer.pop_front());
            }
            buffer.mutate_in_place(|data| data.sort());
            buffer.mutate_and_push_back(total, |data| data.sort());
            for i in popped..total + 1 {
                assert_eq!(Some(i), buffer.pop_front());
            }
            assert_eq!(None, buffer.pop_front());
        }
    }
}
option. This file may not be copied, modified, or distributed // except according to those terms. use std::fmt::{self, Write}; +use buffer::Buffer; #[derive(Clone)] enum DecompositionType { @@ -20,7 +21,6 @@ enum DecompositionType { pub struct Decompositions { kind: DecompositionType, iter: I, - done: bool, // This buffer stores pairs of (canonical combining class, character), // pushed onto the end in text order. @@ -29,7 +29,7 @@ pub struct Decompositions { // `ready` pairs are sorted and ready to emit on demand. The "pending" // suffix afterwards still needs more characters for us to be able to sort // in canonical order and is not safe to emit. - buffer: Vec<(u8, char)>, + buffer: Buffer<(u8, char)>, ready: usize, } @@ -38,8 +38,7 @@ pub fn new_canonical>(iter: I) -> Decompositions { Decompositions { kind: self::DecompositionType::Canonical, iter: iter, - done: false, - buffer: Vec::new(), + buffer: Buffer::new(), ready: 0, } } @@ -49,32 +48,41 @@ pub fn new_compatible>(iter: I) -> Decompositions { Decompositions { kind: self::DecompositionType::Compatible, iter: iter, - done: false, - buffer: Vec::new(), + buffer: Buffer::new(), ready: 0, } } +#[inline] +fn sort_by_combining_class(data: &mut [(u8, char)]) { + // NB: `sort_by_key` is stable, so it will preserve the original text's + // order within a combining class. 
+ data.sort_by_key(|k| k.0); +} + impl Decompositions { #[inline] fn push_back(&mut self, ch: char) { let class = super::char::canonical_combining_class(ch); + if class == 0 { - self.sort_pending(); + let ready = &mut self.ready; + self.buffer.mutate_and_push_back((class, ch), |prior_data| { + sort_by_combining_class(&mut prior_data[*ready..]); + *ready = prior_data.len(); + }); + } else { + self.buffer.push_back((class, ch)); } - self.buffer.push((class, ch)); } #[inline] fn sort_pending(&mut self) { - if self.ready == 0 && self.buffer.is_empty() { - return; - } - - // NB: `sort_by_key` is stable, so it will preserve the original text's - // order within a combining class. - self.buffer[self.ready..].sort_by_key(|k| k.0); - self.ready = self.buffer.len(); + let ready = &mut self.ready; + self.buffer.mutate_in_place(|data| { + sort_by_combining_class(&mut data[*ready..]); + *ready = data.len(); + }); } #[inline] @@ -83,7 +91,7 @@ impl Decompositions { None } else { self.ready -= 1; - Some(self.buffer.remove(0).1) + Some(self.buffer.pop_front().unwrap().1) } } } @@ -93,7 +101,7 @@ impl> Iterator for Decompositions { #[inline] fn next(&mut self) -> Option { - while self.ready == 0 && !self.done { + while self.ready == 0 { match (self.iter.next(), &self.kind) { (Some(ch), &DecompositionType::Canonical) => { super::char::decompose_canonical(ch, |d| self.push_back(d)); @@ -103,10 +111,11 @@ impl> Iterator for Decompositions { }, (None, _) => { self.sort_pending(); - self.done = true; + break; }, } } + self.pop_front() } diff --git a/src/lib.rs b/src/lib.rs index 1f19df2..8e0fd39 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -64,6 +64,7 @@ mod recompose; mod quick_check; mod stream_safe; mod tables; +mod buffer; #[cfg(test)] mod test; diff --git a/src/recompose.rs b/src/recompose.rs index 76740f0..e3a33d2 100644 --- a/src/recompose.rs +++ b/src/recompose.rs @@ -8,9 +8,9 @@ // option. This file may not be copied, modified, or distributed // except according to those terms. 
-use std::collections::VecDeque; use std::fmt::{self, Write}; use decompose::Decompositions; +use buffer::Buffer; #[derive(Clone)] enum RecompositionState { @@ -24,7 +24,7 @@ enum RecompositionState { pub struct Recompositions { iter: Decompositions, state: RecompositionState, - buffer: VecDeque, + buffer: Buffer, composee: Option, last_ccc: Option } @@ -34,7 +34,7 @@ pub fn new_canonical>(iter: I) -> Recompositions { Recompositions { iter: super::decompose::new_canonical(iter), state: self::RecompositionState::Composing, - buffer: VecDeque::new(), + buffer: Buffer::new(), composee: None, last_ccc: None, } @@ -45,7 +45,7 @@ pub fn new_compatible>(iter: I) -> Recompositions { Recompositions { iter: super::decompose::new_compatible(iter), state: self::RecompositionState::Composing, - buffer: VecDeque::new(), + buffer: Buffer::new(), composee: None, last_ccc: None, }