Skip to content

Instantly share code, notes, and snippets.

Created September 9, 2013 02:25
Show Gist options
  • Save anonymous/904d9e4c90a1a199f452 to your computer and use it in GitHub Desktop.
Save anonymous/904d9e4c90a1a199f452 to your computer and use it in GitHub Desktop.
/*
running 7 tests
test bench_decode_utf8_replace_none_condition ... bench: 1554 ns/iter (+/- 70)
test bench_decode_utf8_replace_none_condition_no_amortized ... bench: 1974 ns/iter (+/- 267)
test bench_decode_utf8_replace_none_condition_nocatch ... bench: 1536 ns/iter (+/- 51)
test bench_decode_utf8_replace_none_default ... bench: 1460 ns/iter (+/- 50)
test bench_decode_utf8_replace_once_condition ... bench: 3423 ns/iter (+/- 449)
test bench_decode_utf8_replace_once_condition_no_amortized ... bench: 4041 ns/iter (+/- 160)
test bench_decode_utf8_replace_once_default ... bench: 1586 ns/iter (+/- 40)
test result: ok. 0 passed; 0 failed; 0 ignored; 7 measured
*/
extern mod extra;
/// Iterator adaptor with a one-item "put back" slot.
///
/// `A` is the item type and `T` the wrapped iterator.
#[deriving(Clone)]
struct PutBack<A, T> {
// Item handed back through `put_back`, if any; `next()` drains it first.
top: Option<A>,
// The wrapped iterator, consulted only when `top` is empty.
iter: T
}
impl<A, T> PutBack<A, T> {
    /// Wrap `it` in a `PutBack` whose put-back slot starts out empty.
    #[inline]
    pub fn new(it: T) -> PutBack<A, T> {
        PutBack { iter: it, top: None }
    }

    /// Store `x` in the put-back slot.
    ///
    /// The slot holds a single item, so whatever was stored there before
    /// is overwritten.
    #[inline]
    pub fn put_back(&mut self, x: A) {
        let slot = Some(x);
        self.top = slot;
    }
}
impl<A, T: Iterator<A>> Iterator<A> for PutBack<A, T> {
    /// Yield the put-back item if one is pending (emptying the slot);
    /// otherwise pull the next item from the wrapped iterator.
    #[inline]
    fn next(&mut self) -> Option<A> {
        let pending = self.top.take();
        if pending.is_some() {
            pending
        } else {
            self.iter.next()
        }
    }

    /// Size hint of the wrapped iterator, with both bounds raised by one
    /// while an item is waiting in the put-back slot.
    // NOTE(review): the `+ 1` adjustments are not guarded against overflow.
    #[inline]
    fn size_hint(&self) -> (uint, Option<uint>) {
        let (lower, upper) = self.iter.size_hint();
        if self.top.is_none() {
            (lower, upper)
        } else {
            (lower + 1, upper.map(|n| *n + 1))
        }
    }
}
/// The kinds of malformed input that the decoder reports to a
/// `DecoderHandler`.
pub enum DecoderError {
/// The input ended in the middle of a multi-byte character.
IncompleteInput,
/// A byte that is not valid at its position in a code point's encoding.
InvalidSequence,
}
// Declares the `decode_error` condition. It is raised with the error kind and
// an owned copy of the offending bytes; the handler that traps it supplies the
// replacement string as a `~str`.
// NOTE(review): the `super::` path is presumably needed because `condition!`
// expands into its own module -- confirm against the macro's documentation.
condition! {
pub decode_error: (super::DecoderError, ~[u8]) -> ~str;
}
/// Strategy for dealing with malformed input found while decoding.
pub trait DecoderHandler {
/// Called once per decoding error.
///
/// `msg` is the kind of error and `sequence` the offending input bytes.
/// The implementation may fail, raise a condition, or return a
/// replacement string; `utf8_decode` appends the returned string to its
/// output.
fn error_str(&self, msg: DecoderError, sequence: &[u8]) -> ~str;
}
/// Handler that raises the condition `decode_error` for every decoding error.
/// Decoding continues if the condition is resolved, using the string the
/// condition handler returned as the replacement.
#[deriving(Clone)]
pub struct Strict;
/// Handler that substitutes the default replacement character (U+FFFD) for
/// every decoding error.
#[deriving(Clone)]
pub struct ReplaceDefault;
impl DecoderHandler for ReplaceDefault {
/// Ignores both the error kind and the offending bytes and always returns
/// a freshly allocated string containing U+FFFD.
#[inline(always)]
fn error_str(&self, _: DecoderError, _: &[u8]) -> ~str {
~"\uFFFD"
}
}
impl DecoderHandler for Strict {
/// Raises `decode_error` with the error kind and an owned copy of the
/// offending bytes, and returns whatever string the raise yields.
fn error_str(&self, msg: DecoderError, buf: &[u8]) -> ~str {
decode_error::cond.raise((msg, buf.to_owned()))
}
}
// Decoder state after a multi-byte lead byte has been read: which byte range
// the next byte must fall in, and how many continuation bytes follow it.
// The restricted ranges (E0, ED, F0, F4) are tighter than the generic
// 80 .. BF continuation range.
enum UTF8Expect {
Ct, // Expecting 80 .. BF (one continuation byte)
E0, // Expecting A0 .. BF then one continuation byte
Et, // Expecting two continuation bytes
ED, // Expecting 80 .. 9F then one continuation byte
F0, // Expecting 90 .. BF then two continuation bytes
Ft, // Expecting three continuation bytes
F4, // Expecting 80 .. 8F then two continuation bytes
}
/// Decode `data` as UTF-8 into an owned string, asking `err` for a
/// replacement string whenever the input is malformed.
///
/// Well-formed bytes are not copied one at a time: `start`/`valid` track the
/// current run of accepted input, and `flush` copies that run into the output
/// with a single `push_all` right before a replacement is written and once
/// more at the end.
///
/// Errors are reported to `err.error_str` as:
///
/// * `InvalidSequence` with the single offending byte, when a byte cannot
///   start a sequence;
/// * `InvalidSequence` with the bytes of the sequence read so far, when the
///   next byte is not an acceptable continuation (that byte is then examined
///   again as a possible start byte);
/// * `IncompleteInput` with the unfinished tail, when the input ends inside a
///   multi-byte sequence.
///
/// Whatever `error_str` returns is appended to the output in place of the
/// offending bytes.
fn utf8_decode<E: DecoderHandler>(data: &[u8], err: E) -> ~str {
use std::str;
// Output buffer, pre-sized to the input length; replacements may grow it.
let mut res = std::vec::with_capacity(data.len());
// (index, byte) pairs, wrapped so one rejected byte can be looked at again.
let mut it = PutBack::new(data.iter().map(|&x| x).enumerate());
// Length in bytes of the pending run of accepted input.
let mut valid = 0u;
// Index in `data` where the pending run starts, if there is one.
let mut start = None;
// Expectation for the next byte of the sequence currently being read.
let mut expt;
// Extend the pending run by `step` bytes; `i` becomes the start of the run
// only if no run is pending.
let extend_valid = |i, step| {
if start.is_none() { start = Some(i) }
valid += step;
};
// Copy the pending run (if any) to the output and reset the run.
let flush = || {
match start {
Some(st) => res.push_all(data.slice(st, st + valid)),
_ => ()
}
start = None; valid = 0
};
// Ask the handler for a replacement for `slc` and append it to the output.
let replace = |msg, slc| {
let repl = err.error_str(msg, slc);
res.push_all(repl.as_bytes());
};
loop {
// Index of the lead byte of the sequence being decoded.
let mut ch_start;
match it.next() {
None => break,
Some((i, b)) => {
// Classify the first byte of a sequence.
match b {
// Single-byte sequence: accept it and go straight to the next byte
// (`loop` used as a statement continues the enclosing loop).
0x00 .. 0x7F => { extend_valid(i, 1); loop },
0xC2 .. 0xDF => expt = Ct,
0xE0 => expt = E0,
0xE1 .. 0xEC |
0xEE .. 0xEF => expt = Et,
0xED => expt = ED,
0xF0 => expt = F0,
0xF1 .. 0xF3 => expt = Ft,
0xF4 => expt = F4,
// Every other value (80 .. C1 and F5 .. FF) cannot start a sequence.
_ => {
flush();
replace(InvalidSequence, &[b]);
loop
}
}
ch_start = i;
}
}
// Read the remaining bytes of the multi-byte sequence started at `ch_start`.
loop {
let (i, b) = match it.next() {
None => {
// Error: incomplete stream. Everything from the lead byte to the end
// of the input is reported as one unfinished sequence.
flush();
replace(IncompleteInput, data.slice_from(ch_start));
// `res` holds only accepted slices of `data` and bytes of handler-supplied
// strings, which is what the unchecked conversion relies on.
return unsafe { str::raw::from_utf8_owned(res) }
},
Some(x) => x,
};
match (expt, b) {
// Second byte of a three-byte sequence accepted: one continuation left.
(E0, 0xA0 .. 0xBF) |
(Et, 0x80 .. 0xBF) |
(ED, 0x80 .. 0x9F) => expt = Ct,
// Second byte of a four-byte sequence accepted: two continuations left.
(F0, 0x90 .. 0xBF) |
(Ft, 0x80 .. 0xBF) |
(F4, 0x80 .. 0x8F) => expt = Et,
// Final continuation byte: the whole sequence joins the pending run.
(Ct, 0x80 .. 0xBF) => {
extend_valid(ch_start, i + 1 - ch_start);
break
},
_ => {
// Error: Invalid continuation byte. Only the bytes before `b` are
// replaced.
flush();
replace(InvalidSequence, data.slice(ch_start, i));
/* To be UTF-8 conformant we MUST not skip over valid start bytes here */
it.put_back((i, b));
break
},
}
}
}
// Copy whatever accepted input is still pending.
flush();
// Same reasoning as for the early return above.
unsafe { str::raw::from_utf8_owned(res) }
}
// U+FFFD as a `char`.
// NOTE(review): not referenced anywhere in the visible part of this file.
static ReplacementChar: char = '\ufffd';
// U+FFFD as a string; the benchmark condition handlers return an owned copy
// of it.
static ReplacementCharStr: &'static str = "\uFFFD";
// Takes ownership of its argument and does nothing with it. It is never
// inlined; presumably this keeps the benchmarked decode results from being
// optimized away -- TODO confirm.
#[inline(never)]
fn used<T>(_: T) { }
// Benchmark input for the `replace_none` benchmarks: 210 bytes of UTF-8 sample
// text beginning with "Linguistics and dictionaries:", mixing ASCII with two-
// and three-byte sequences.
static longer_text: [u8, ..210] =
[76, 105, 110, 103, 117, 105, 115, 116, 105, 99, 115, 32, 97, 110, 100, 32, 100, 105, 99, 116, 105, 111, 110, 97, 114, 105, 101, 115, 58, 10, 10, 32, 32, 195, 176, 105, 32, 196, 177, 110, 116, 201, 153, 203, 136, 110, 195, 166, 202, 131, 201, 153, 110, 201, 153, 108, 32, 102, 201, 153, 203, 136, 110, 201, 155, 116, 196, 177, 107, 32, 201, 153, 115, 111, 202, 138, 115, 105, 203, 136, 101, 196, 177, 202, 131, 110, 10, 32, 32, 89, 32, 91, 203, 136, 202, 143, 112, 115, 105, 108, 201, 148, 110, 93, 44, 32, 89, 101, 110, 32, 91, 106, 201, 155, 110, 93, 44, 32, 89, 111, 103, 97, 32, 91, 203, 136, 106, 111, 203, 144, 103, 201, 145, 93, 10, 10, 65, 80, 76, 58, 10, 10, 32, 32, 40, 40, 86, 226, 141, 179, 86, 41, 61, 226, 141, 179, 226, 141, 180, 86, 41, 47, 86, 226, 134, 144, 44, 86, 32, 32, 32, 32, 226, 140, 183, 226, 134, 144, 226, 141, 179, 226, 134, 146, 226, 141, 180, 226, 136, 134, 226, 136, 135, 226, 138, 131, 226, 128, 190, 226, 141, 142, 226, 141, 149, 226, 140, 136, 10, 10];
// Benchmark input for the `replace_once` benchmarks: the same sample text as
// `longer_text`, except that it contains the byte 255 (0xFF, never valid in
// UTF-8) where `longer_text` has a comma (44) after the "Yen [...]" entry.
// The rest appears to be identical.
static longer_text_one_error: [u8, ..210] =
[76, 105, 110, 103, 117, 105, 115, 116, 105, 99, 115, 32, 97, 110, 100, 32, 100, 105, 99, 116, 105, 111, 110, 97, 114, 105, 101, 115, 58, 10, 10, 32, 32, 195, 176, 105, 32, 196, 177, 110, 116, 201, 153, 203, 136, 110, 195, 166, 202, 131, 201, 153, 110, 201, 153, 108, 32, 102, 201, 153, 203, 136, 110, 201, 155, 116, 196, 177, 107, 32, 201, 153, 115, 111, 202, 138, 115, 105, 203, 136, 101, 196, 177, 202, 131, 110, 10, 32, 32, 89, 32, 91, 203, 136, 202, 143, 112, 115, 105, 108, 201, 148, 110, 93, 44, 32, 89, 101, 110, 32, 91, 106, 201, 155, 110, 93, 255, 32, 89, 111, 103, 97, 32, 91, 203, 136, 106, 111, 203, 144, 103, 201, 145, 93, 10, 10, 65, 80, 76, 58, 10, 10, 32, 32, 40, 40, 86, 226, 141, 179, 86, 41, 61, 226, 141, 179, 226, 141, 180, 86, 41, 47, 86, 226, 134, 144, 44, 86, 32, 32, 32, 32, 226, 140, 183, 226, 134, 144, 226, 141, 179, 226, 134, 146, 226, 141, 180, 226, 136, 134, 226, 136, 135, 226, 138, 131, 226, 128, 190, 226, 141, 142, 226, 141, 149, 226, 140, 136, 10, 10];
// Decode `longer_text` with the `ReplaceDefault` handler.
#[bench]
fn bench_decode_utf8_replace_none_default(b: &mut extra::test::BenchHarness) {
do b.iter {
used(utf8_decode(longer_text, ReplaceDefault));
}
}
// Decode `longer_text` with the `Strict` handler. The `decode_error` trap is
// installed once, around the whole measurement loop.
#[bench]
fn bench_decode_utf8_replace_none_condition(b: &mut extra::test::BenchHarness) {
do decode_error::cond.trap(|_| ReplacementCharStr.to_owned()).inside {
do b.iter {
used(utf8_decode(longer_text, Strict));
}
}
}
// Decode `longer_text` with the `Strict` handler and no `decode_error` trap
// installed at all. This presumably relies on `longer_text` being well-formed
// so the condition is never raised -- verify if the input changes.
#[bench]
fn bench_decode_utf8_replace_none_condition_nocatch(b: &mut extra::test::BenchHarness) {
do b.iter {
used(utf8_decode(longer_text, Strict));
}
}
// Decode `longer_text` with the `Strict` handler.
// Set up the condition handler for each iteration, so the cost of installing
// the trap is part of every measured iteration.
#[bench]
fn bench_decode_utf8_replace_none_condition_no_amortized(b: &mut extra::test::BenchHarness) {
do b.iter {
do decode_error::cond.trap(|_| ReplacementCharStr.to_owned()).inside {
used(utf8_decode(longer_text, Strict));
}
}
}
// Decode `longer_text_one_error` (which contains one 0xFF byte) with the
// `ReplaceDefault` handler.
#[bench]
fn bench_decode_utf8_replace_once_default(b: &mut extra::test::BenchHarness) {
do b.iter {
used(utf8_decode(longer_text_one_error, ReplaceDefault));
}
}
// Decode `longer_text_one_error` (which contains one 0xFF byte) with the
// `Strict` handler. The `decode_error` trap is installed once, around the
// whole measurement loop, and answers with the replacement character string.
#[bench]
fn bench_decode_utf8_replace_once_condition(b: &mut extra::test::BenchHarness) {
do decode_error::cond.trap(|_| ReplacementCharStr.to_owned()).inside {
do b.iter {
used(utf8_decode(longer_text_one_error, Strict));
}
}
}
// Decode `longer_text_one_error` (which contains one 0xFF byte) with the
// `Strict` handler.
// Set up the condition handler for each iteration, so the cost of installing
// the trap is part of every measured iteration.
#[bench]
fn bench_decode_utf8_replace_once_condition_no_amortized(b: &mut extra::test::BenchHarness) {
do b.iter {
do decode_error::cond.trap(|_| ReplacementCharStr.to_owned()).inside {
used(utf8_decode(longer_text_one_error, Strict));
}
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment