uint32 state = BGN;
/*
- * Sixteen seems to give the best balance of performance across different
- * byte distributions.
+ * With a stride of two vector widths, gcc will unroll the loop. Even if
+ * the compiler can unroll a longer loop, it's not worth it because we
+ * must fall back to the byte-wise algorithm if we find any non-ASCII.
*/
-#define STRIDE_LENGTH 16
+#define STRIDE_LENGTH (2 * sizeof(Vector8))
if (len >= STRIDE_LENGTH)
{
#ifndef PG_WCHAR_H
#define PG_WCHAR_H
+#include "port/simd.h"
+
/*
* The pg_wchar type
*/
* Verify a chunk of bytes for valid ASCII.
*
* Returns false if the input contains any zero bytes or bytes with the
- * high-bit set. Input len must be a multiple of 8.
+ * high-bit set. Input len must be a multiple of the chunk size (8 or 16).
*/
static inline bool
is_valid_ascii(const unsigned char *s, int len)
{
const unsigned char *const s_end = s + len;
- uint64 chunk,
- highbit_cum = UINT64CONST(0),
- zero_cum = UINT64CONST(0x8080808080808080);
+ Vector8 chunk;
+ Vector8 highbit_cum = vector8_broadcast(0);
+#ifdef USE_NO_SIMD
+ Vector8 zero_cum = vector8_broadcast(0x80);
+#endif
Assert(len % sizeof(chunk) == 0);
while (s < s_end)
{
- memcpy(&chunk, s, sizeof(chunk));
+ vector8_load(&chunk, s);
+
+ /* Capture any zero bytes in this chunk. */
+#ifdef USE_NO_SIMD
/*
- * Capture any zero bytes in this chunk.
- *
* First, add 0x7f to each byte. This sets the high bit in each byte,
* unless it was a zero. If any resulting high bits are zero, the
* corresponding high bits in the zero accumulator will be cleared.
* any input bytes did have the high bit set, it doesn't matter
* because we check for those separately.
*/
- zero_cum &= (chunk + UINT64CONST(0x7f7f7f7f7f7f7f7f));
+ zero_cum &= (chunk + vector8_broadcast(0x7F));
+#else
+
+ /*
+ * Set all bits in each lane of the highbit accumulator where input
+ * bytes are zero.
+ */
+ highbit_cum = vector8_or(highbit_cum,
+ vector8_eq(chunk, vector8_broadcast(0)));
+#endif
/* Capture all set bits in this chunk. */
- highbit_cum |= chunk;
+ highbit_cum = vector8_or(highbit_cum, chunk);
s += sizeof(chunk);
}
/* Check if any high bits in the high bit accumulator got set. */
- if (highbit_cum & UINT64CONST(0x8080808080808080))
+ if (vector8_is_highbit_set(highbit_cum))
return false;
+#ifdef USE_NO_SIMD
/* Check if any high bits in the zero accumulator got cleared. */
- if (zero_cum != UINT64CONST(0x8080808080808080))
+ if (zero_cum != vector8_broadcast(0x80))
return false;
+#endif
return true;
}
static inline bool vector8_has(const Vector8 v, const uint8 c);
static inline bool vector8_has_zero(const Vector8 v);
static inline bool vector8_has_le(const Vector8 v, const uint8 c);
+static inline bool vector8_is_highbit_set(const Vector8 v);
+/* arithmetic operations */
+static inline Vector8 vector8_or(const Vector8 v1, const Vector8 v2);
+
+/* Different semantics for SIMD architectures. */
+#ifndef USE_NO_SIMD
+
+/* comparisons between vectors */
+static inline Vector8 vector8_eq(const Vector8 v1, const Vector8 v2);
+
+#endif /* ! USE_NO_SIMD */
/*
* Load a chunk of memory into the given vector.
return result;
}
+/*
+ * Return true if the high bit of any element (byte) of v is set.
+ *
+ * With SSE2, _mm_movemask_epi8 collects the high bit of every byte into an
+ * integer mask, so a nonzero mask means at least one high bit was set.
+ * Otherwise v is masked with 0x80 broadcast to every byte using plain
+ * integer arithmetic.
+ */
+static inline bool
+vector8_is_highbit_set(const Vector8 v)
+{
+#ifdef USE_SSE2
+	return _mm_movemask_epi8(v) != 0;
+#else
+
+	/*
+	 * Compare against zero explicitly instead of relying on implicit
+	 * conversion to bool: if bool is a narrow integer typedef rather than
+	 * a true boolean type, the masked value would be truncated and set
+	 * high bits in the upper bytes would be silently lost.
+	 */
+	return (v & vector8_broadcast(0x80)) != 0;
+#endif
+}
+
+/*
+ * Bitwise OR of two vectors: a bit in the result is set if it is set in
+ * either v1 or v2.
+ */
+static inline Vector8
+vector8_or(const Vector8 v1, const Vector8 v2)
+{
+	Vector8		combined;
+
+#ifdef USE_SSE2
+	combined = _mm_or_si128(v1, v2);
+#else
+	combined = v1 | v2;
+#endif
+
+	return combined;
+}
+
+
+/* Different semantics for SIMD architectures. */
+#ifndef USE_NO_SIMD
+
+/*
+ * Return a vector with all bits set in each lane where the corresponding
+ * lanes in the inputs are equal.
+ *
+ * NOTE(review): only a USE_SSE2 implementation is present. If this is ever
+ * compiled with neither USE_NO_SIMD nor USE_SSE2 defined, the function body
+ * is empty and nothing is returned. Presumably SSE2 is the only SIMD
+ * implementation at this point -- confirm before adding another
+ * architecture.
+ */
+static inline Vector8
+vector8_eq(const Vector8 v1, const Vector8 v2)
+{
+#ifdef USE_SSE2
+ return _mm_cmpeq_epi8(v1, v2);
+#endif
+}
+
+#endif /* ! USE_NO_SIMD */
+
#endif /* SIMD_H */