-
-
Save anonymous/904d9e4c90a1a199f452 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
running 7 tests | |
test bench_decode_utf8_replace_none_condition ... bench: 1554 ns/iter (+/- 70) | |
test bench_decode_utf8_replace_none_condition_no_amortized ... bench: 1974 ns/iter (+/- 267) | |
test bench_decode_utf8_replace_none_condition_nocatch ... bench: 1536 ns/iter (+/- 51) | |
test bench_decode_utf8_replace_none_default ... bench: 1460 ns/iter (+/- 50) | |
test bench_decode_utf8_replace_once_condition ... bench: 3423 ns/iter (+/- 449) | |
test bench_decode_utf8_replace_once_condition_no_amortized ... bench: 4041 ns/iter (+/- 160) | |
test bench_decode_utf8_replace_once_default ... bench: 1586 ns/iter (+/- 40) | |
test result: ok. 0 passed; 0 failed; 0 ignored; 7 measured | |
*/ | |
extern mod extra; | |
/// Iterator adaptor that wraps another iterator and has a single slot
/// (`top`) for an item that should be yielded before the wrapped
/// iterator's remaining items.
#[deriving(Clone)] | |
struct PutBack<A, T> { | |
// The put-back item, if any; `next` hands it out first.
top: Option<A>, | |
// The wrapped iterator.
iter: T | |
} | |
impl<A, T> PutBack<A, T> { | |
/// Wrap the iterator `it`. The put-back slot starts out empty.
#[inline] | |
pub fn new(it: T) -> PutBack<A, T> { | |
PutBack{top: None, iter: it} | |
} | |
/// Store `x` in the put-back slot so that the next call to `next`
/// returns it instead of advancing the wrapped iterator.
///
/// There is only one slot: an item put back earlier and not yet
/// consumed is overwritten.
#[inline] | |
pub fn put_back(&mut self, x: A) { | |
self.top = Some(x) | |
} | |
} | |
impl<A, T: Iterator<A>> Iterator<A> for PutBack<A, T> { | |
/// Return the put-back item if there is one (emptying the slot);
/// otherwise return the next item of the wrapped iterator.
#[inline] | |
fn next(&mut self) -> Option<A> { | |
// `take()` leaves `None` in the slot, so a put-back item is yielded once.
match self.top.take() { | |
None => self.iter.next(), | |
top => top, | |
} | |
} | |
/// Size hint of the wrapped iterator, with both bounds increased by
/// one while an item is waiting in the put-back slot.
#[inline] | |
fn size_hint(&self) -> (uint, Option<uint>) { | |
let (lo, hi) = self.iter.size_hint(); | |
if self.top.is_some() { | |
// NOTE(review): both additions are plain `+ 1` with no overflow guard.
(lo + 1, hi.map(|a| *a + 1)) | |
} else { (lo, hi) } | |
} | |
} | |
/// Kind of decoding failure passed to a `DecoderHandler`.
pub enum DecoderError { | |
/// End of input with unfinished character | |
IncompleteInput, | |
/// Invalid byte in codepoint encoding
InvalidSequence, | |
} | |
// Condition raised by the `Strict` handler when decoding fails.
// The raised value is the error kind together with an owned copy of the
// offending bytes; the trapping handler answers with a `~str`, which the
// decoder then appends to its output as the replacement.
// NOTE(review): the `super::` path is presumably needed because the macro
// expands into a nested module - confirm against the macro definition.
condition! { | |
pub decode_error: (super::DecoderError, ~[u8]) -> ~str; | |
} | |
/// Policy for handling decoder errors.
///
/// The decoder calls `error_str` once per malformed or truncated sequence
/// and appends the returned string to its output.
pub trait DecoderHandler { | |
/// The DecoderHandler may fail, raise a condition, | |
/// or return a replacement string. | |
///
/// * `msg` - the kind of error that was detected
/// * `sequence` - the input bytes that make up the offending sequence
fn error_str(&self, msg: DecoderError, sequence: &[u8]) -> ~str; | |
} | |
/// Raise the condition `decode_error` when an error is encountered. | |
/// Decoding continues if the condition is resolved. | |
/// The string returned by the condition's handler is used as the
/// replacement text for the offending bytes.
#[deriving(Clone)] | |
pub struct Strict; | |
/// Use the decoder's default Replacement Character | |
/// (U+FFFD) for every error, without raising a condition.
#[deriving(Clone)] | |
pub struct ReplaceDefault; | |
impl DecoderHandler for ReplaceDefault { | |
/// Ignore both the error kind and the offending bytes and always
/// return a freshly allocated string holding U+FFFD.
#[inline(always)] | |
fn error_str(&self, _: DecoderError, _: &[u8]) -> ~str { | |
~"\uFFFD" | |
} | |
} | |
impl DecoderHandler for Strict { | |
/// Raise `decode_error` with the error kind and an owned copy of the
/// offending bytes, and return whatever string the condition's handler
/// produces.
fn error_str(&self, msg: DecoderError, buf: &[u8]) -> ~str { | |
// The condition payload is owned, hence the copy of the borrowed slice.
decode_error::cond.raise((msg, buf.to_owned())) | |
} | |
} | |
// State of `utf8_decode` in the middle of a multi-byte character: which
// byte range is acceptable next and how many continuation bytes follow.
// The narrowed ranges after lead bytes E0, ED, F0 and F4 correspond to the
// well-formed UTF-8 table: they exclude overlong encodings (E0, F0),
// surrogate code points (ED) and values above U+10FFFF (F4).
enum UTF8Expect { | |
Ct, // Expecting 80 .. BF (one continuation byte) | |
E0, // Expecting A0 .. BF then one continuation byte | |
Et, // Expecting two continuation bytes | |
ED, // Expecting 80 .. 9F then one continuation byte | |
F0, // Expecting 90 .. BF then two continuation bytes | |
Ft, // Expecting three continuation bytes | |
F4, // Expecting 80 .. 8F then two continuation bytes | |
} | |
/// Decode `data` as UTF-8 into an owned string, handing every malformed or
/// truncated sequence to `err`.
///
/// Runs of well-formed bytes are copied to the output unchanged. For each
/// error, `err.error_str` is called with the error kind and the offending
/// bytes, and the bytes of the string it returns are appended instead.
///
/// * `data` - the bytes to decode
/// * `err` - the error policy; it may return a replacement string, raise a
///   condition, or fail
///
/// Returns the decoded text. The result is built with the unchecked
/// `str::raw::from_utf8_owned`; the buffer it receives holds only byte runs
/// accepted by the state machine below plus the bytes of handler-returned
/// strings.
fn utf8_decode<E: DecoderHandler>(data: &[u8], err: E) -> ~str { | |
use std::str; | |
// Output buffer, pre-sized to the input length.
let mut res = std::vec::with_capacity(data.len()); | |
// (index, byte) pairs, with the ability to push one pair back.
let mut it = PutBack::new(data.iter().map(|&x| x).enumerate()); | |
// `start`/`valid` describe a pending run of accepted bytes,
// `data[start .. start + valid]`, that has not been copied to `res` yet.
let mut valid = 0u; | |
let mut start = None; | |
// What the next byte of the current multi-byte character must look like.
let mut expt; | |
// Grow the pending run by `step` bytes; if no run is pending, it begins at `i`.
let extend_valid = |i, step| { | |
if start.is_none() { start = Some(i) } | |
valid += step; | |
}; | |
// Copy the pending run (if any) into `res` and reset it.
let flush = || { | |
match start { | |
Some(st) => res.push_all(data.slice(st, st + valid)), | |
_ => () | |
} | |
start = None; valid = 0 | |
}; | |
// Ask the handler for replacement text and append its bytes to `res`.
let replace = |msg, slc| { | |
let repl = err.error_str(msg, slc); | |
res.push_all(repl.as_bytes()); | |
}; | |
// Each pass of the outer loop handles one character or one error.
loop { | |
// Index of the lead byte of the character being decoded.
let mut ch_start; | |
match it.next() { | |
None => break, | |
Some((i, b)) => { | |
// Classify the lead byte. A bare `loop` inside an arm is this
// dialect's `continue` for the enclosing loop.
match b { | |
0x00 .. 0x7F => { extend_valid(i, 1); loop }, | |
0xC2 .. 0xDF => expt = Ct, | |
0xE0 => expt = E0, | |
0xE1 .. 0xEC | | |
0xEE .. 0xEF => expt = Et, | |
0xED => expt = ED, | |
0xF0 => expt = F0, | |
0xF1 .. 0xF3 => expt = Ft, | |
0xF4 => expt = F4, | |
// Any other byte (80..C1, F5..FF) cannot start a character: emit
// what is pending, then report this single byte.
_ => { | |
flush(); | |
replace(InvalidSequence, &[b]); | |
loop | |
} | |
} | |
ch_start = i; | |
} | |
} | |
// Consume the following bytes until the character is complete or an
// error is found.
loop { | |
let (i, b) = match it.next() { | |
None => { | |
// Error: incomplete stream | |
// Report everything from the lead byte to the end of input, then finish.
flush(); | |
replace(IncompleteInput, data.slice_from(ch_start)); | |
return unsafe { str::raw::from_utf8_owned(res) } | |
}, | |
Some(x) => x, | |
}; | |
match (expt, b) { | |
// Second byte accepted, one continuation byte left.
(E0, 0xA0 .. 0xBF) | | |
(Et, 0x80 .. 0xBF) | | |
(ED, 0x80 .. 0x9F) => expt = Ct, | |
// Second byte accepted, two continuation bytes left.
(F0, 0x90 .. 0xBF) | | |
(Ft, 0x80 .. 0xBF) | | |
(F4, 0x80 .. 0x8F) => expt = Et, | |
// Final continuation byte: the whole character joins the pending run.
(Ct, 0x80 .. 0xBF) => { | |
extend_valid(ch_start, i + 1 - ch_start); | |
break | |
}, | |
_ => { | |
// Error: Invalid continuation byte | |
// Report the bytes consumed so far for this character; the offending
// byte itself is excluded and pushed back to be examined as a lead byte.
flush(); | |
replace(InvalidSequence, data.slice(ch_start, i)); | |
/* To be UTF-8 conformant we MUST not skip over valid start bytes here */ | |
it.put_back((i, b)); | |
break | |
}, | |
} | |
} | |
} | |
// Input exhausted on a character boundary: emit the last pending run.
flush(); | |
unsafe { str::raw::from_utf8_owned(res) } | |
} | |
// U+FFFD as a `char`; not referenced elsewhere in the visible source.
static ReplacementChar: char = '\ufffd'; | |
// U+FFFD as a string; the benchmarks' condition handlers return a copy of it.
static ReplacementCharStr: &'static str = "\uFFFD"; | |
// Takes ownership of a value and does nothing with it. The benchmarks pass
// their decode results through this never-inlined function, presumably so
// the optimizer cannot discard the decoding work.
#[inline(never)] | |
fn used<T>(_: T) { } | |
// Benchmark input: 210 bytes of sample text whose leading ASCII bytes spell
// "Linguistics and dictionaries:", followed by multi-byte sequences.
// Used by the `replace_none` benchmarks - presumably well-formed UTF-8
// throughout; verify before relying on it.
static longer_text: [u8, ..210] = | |
[76, 105, 110, 103, 117, 105, 115, 116, 105, 99, 115, 32, 97, 110, 100, 32, 100, 105, 99, 116, 105, 111, 110, 97, 114, 105, 101, 115, 58, 10, 10, 32, 32, 195, 176, 105, 32, 196, 177, 110, 116, 201, 153, 203, 136, 110, 195, 166, 202, 131, 201, 153, 110, 201, 153, 108, 32, 102, 201, 153, 203, 136, 110, 201, 155, 116, 196, 177, 107, 32, 201, 153, 115, 111, 202, 138, 115, 105, 203, 136, 101, 196, 177, 202, 131, 110, 10, 32, 32, 89, 32, 91, 203, 136, 202, 143, 112, 115, 105, 108, 201, 148, 110, 93, 44, 32, 89, 101, 110, 32, 91, 106, 201, 155, 110, 93, 44, 32, 89, 111, 103, 97, 32, 91, 203, 136, 106, 111, 203, 144, 103, 201, 145, 93, 10, 10, 65, 80, 76, 58, 10, 10, 32, 32, 40, 40, 86, 226, 141, 179, 86, 41, 61, 226, 141, 179, 226, 141, 180, 86, 41, 47, 86, 226, 134, 144, 44, 86, 32, 32, 32, 32, 226, 140, 183, 226, 134, 144, 226, 141, 179, 226, 134, 146, 226, 141, 180, 226, 136, 134, 226, 136, 135, 226, 138, 131, 226, 128, 190, 226, 141, 142, 226, 141, 149, 226, 140, 136, 10, 10]; | |
// Benchmark input for the `replace_once` benchmarks: appears to be a copy of
// `longer_text` in which a single byte has been changed to 255 (0xFF).
// `utf8_decode` accepts no lead byte above 0xF4, so that byte is reported as
// an invalid sequence.
static longer_text_one_error: [u8, ..210] = | |
[76, 105, 110, 103, 117, 105, 115, 116, 105, 99, 115, 32, 97, 110, 100, 32, 100, 105, 99, 116, 105, 111, 110, 97, 114, 105, 101, 115, 58, 10, 10, 32, 32, 195, 176, 105, 32, 196, 177, 110, 116, 201, 153, 203, 136, 110, 195, 166, 202, 131, 201, 153, 110, 201, 153, 108, 32, 102, 201, 153, 203, 136, 110, 201, 155, 116, 196, 177, 107, 32, 201, 153, 115, 111, 202, 138, 115, 105, 203, 136, 101, 196, 177, 202, 131, 110, 10, 32, 32, 89, 32, 91, 203, 136, 202, 143, 112, 115, 105, 108, 201, 148, 110, 93, 44, 32, 89, 101, 110, 32, 91, 106, 201, 155, 110, 93, 255, 32, 89, 111, 103, 97, 32, 91, 203, 136, 106, 111, 203, 144, 103, 201, 145, 93, 10, 10, 65, 80, 76, 58, 10, 10, 32, 32, 40, 40, 86, 226, 141, 179, 86, 41, 61, 226, 141, 179, 226, 141, 180, 86, 41, 47, 86, 226, 134, 144, 44, 86, 32, 32, 32, 32, 226, 140, 183, 226, 134, 144, 226, 141, 179, 226, 134, 146, 226, 141, 180, 226, 136, 134, 226, 136, 135, 226, 138, 131, 226, 128, 190, 226, 141, 142, 226, 141, 149, 226, 140, 136, 10, 10]; | |
// Decode `longer_text` with the `ReplaceDefault` handler on every iteration.
#[bench] | |
fn bench_decode_utf8_replace_none_default(b: &mut extra::test::BenchHarness) { | |
do b.iter { | |
used(utf8_decode(longer_text, ReplaceDefault)); | |
} | |
} | |
// Decode `longer_text` with the `Strict` handler. A `decode_error` trap that
// answers with the replacement string is installed once, around the whole
// measurement loop.
#[bench] | |
fn bench_decode_utf8_replace_none_condition(b: &mut extra::test::BenchHarness) { | |
do decode_error::cond.trap(|_| ReplacementCharStr.to_owned()).inside { | |
do b.iter { | |
used(utf8_decode(longer_text, Strict)); | |
} | |
} | |
} | |
// Decode `longer_text` with the `Strict` handler and no trap installed.
// NOTE(review): this relies on the input never raising `decode_error`;
// presumably an untrapped raise would fail the task - confirm.
#[bench] | |
fn bench_decode_utf8_replace_none_condition_nocatch(b: &mut extra::test::BenchHarness) { | |
do b.iter { | |
used(utf8_decode(longer_text, Strict)); | |
} | |
} | |
// Set up the condition handler for each iteration | |
// Same decode as the `Strict` benchmark on `longer_text`, but the trap is
// installed inside the measured closure, so its setup is part of every
// iteration.
#[bench] | |
fn bench_decode_utf8_replace_none_condition_no_amortized(b: &mut extra::test::BenchHarness) { | |
do b.iter { | |
do decode_error::cond.trap(|_| ReplacementCharStr.to_owned()).inside { | |
used(utf8_decode(longer_text, Strict)); | |
} | |
} | |
} | |
// Decode `longer_text_one_error` with the `ReplaceDefault` handler on every
// iteration.
#[bench] | |
fn bench_decode_utf8_replace_once_default(b: &mut extra::test::BenchHarness) { | |
do b.iter { | |
used(utf8_decode(longer_text_one_error, ReplaceDefault)); | |
} | |
} | |
// Decode `longer_text_one_error` with the `Strict` handler. The
// `decode_error` trap, which answers with the replacement string, is
// installed once around the whole measurement loop.
#[bench] | |
fn bench_decode_utf8_replace_once_condition(b: &mut extra::test::BenchHarness) { | |
do decode_error::cond.trap(|_| ReplacementCharStr.to_owned()).inside { | |
do b.iter { | |
used(utf8_decode(longer_text_one_error, Strict)); | |
} | |
} | |
} | |
// Set up the condition handler for each iteration | |
// Decodes `longer_text_one_error` with the `Strict` handler; the trap is
// installed inside the measured closure, so its setup is part of every
// iteration.
#[bench] | |
fn bench_decode_utf8_replace_once_condition_no_amortized(b: &mut extra::test::BenchHarness) { | |
do b.iter { | |
do decode_error::cond.trap(|_| ReplacementCharStr.to_owned()).inside { | |
used(utf8_decode(longer_text_one_error, Strict)); | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment